diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,68034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 270.0625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.11410666164010763, + "epoch": 0.002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6294371485710144, + "learning_rate": 0.0, + "loss": 0.0098, + "num_tokens": 8674.0, + "reward": 1.6712751388549805, + "reward_std": 11.362724304199219, + "rewards/fitness_reward/mean": 1.533573865890503, + "rewards/fitness_reward/std": 5.61301851272583, + "rewards/kidney_reward/mean": -0.04256998002529144, + "rewards/kidney_reward/std": 2.75425124168396, + "rewards/length2tails_reward/mean": 0.601272702217102, + "rewards/length2tails_reward/std": 0.4386332929134369, + "rewards/repeated_in_batch_reward/mean": 0.78125, + "rewards/repeated_in_batch_reward/std": 0.420013427734375, + "rewards/thermo_reward/mean": 0.042019158601760864, + "rewards/thermo_reward/std": 3.2507805824279785, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.5, + "completions/mean_terminated_length": 269.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0807526521384716, + "epoch": 0.004, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29795771837234497, + "learning_rate": 4e-08, + "loss": 0.0027, + "num_tokens": 17330.0, + "reward": 3.9355084896087646, + "reward_std": 10.647903442382812, + "rewards/fitness_reward/mean": 2.646754741668701, + "rewards/fitness_reward/std": 5.466333866119385, + "rewards/kidney_reward/mean": 0.5920517444610596, + "rewards/kidney_reward/std": 2.4331250190734863, + "rewards/length2tails_reward/mean": 0.5398199558258057, + "rewards/length2tails_reward/std": 0.40852677822113037, + "rewards/repeated_in_batch_reward/mean": 0.84375, + "rewards/repeated_in_batch_reward/std": 0.3689020276069641, + "rewards/thermo_reward/mean": 0.5583449602127075, + "rewards/thermo_reward/std": 3.1016311645507812, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 266.875, + "completions/mean_terminated_length": 266.875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.0851337956264615, + "epoch": 0.006, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43736904859542847, + "learning_rate": 8e-08, + "loss": -0.0293, + "num_tokens": 25902.0, + "reward": 1.0216810703277588, + "reward_std": 11.25409984588623, + "rewards/fitness_reward/mean": 1.7421739101409912, + "rewards/fitness_reward/std": 5.732826232910156, + "rewards/kidney_reward/mean": -0.23049303889274597, + "rewards/kidney_reward/std": 2.7518129348754883, + "rewards/length2tails_reward/mean": 0.7120003700256348, + "rewards/length2tails_reward/std": 0.3755108118057251, + "rewards/repeated_in_batch_reward/mean": 0.90625, + "rewards/repeated_in_batch_reward/std": 0.2961445748806, + "rewards/thermo_reward/mean": -0.6518245935440063, + "rewards/thermo_reward/std": 3.3029723167419434, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 266.9375, + "completions/mean_terminated_length": 266.9375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.08211329486221075, + "epoch": 0.008, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2764379680156708, + "learning_rate": 1.2e-07, + "loss": -0.0176, + "num_tokens": 34476.0, + "reward": 3.567150831222534, + "reward_std": 10.611852645874023, + "rewards/fitness_reward/mean": 2.5539772510528564, + "rewards/fitness_reward/std": 5.466435432434082, + "rewards/kidney_reward/mean": 0.4766693413257599, + "rewards/kidney_reward/std": 2.4557559490203857, + "rewards/length2tails_reward/mean": 0.5894806385040283, + "rewards/length2tails_reward/std": 0.4003503620624542, + "rewards/repeated_in_batch_reward/mean": 0.8125, + "rewards/repeated_in_batch_reward/std": 0.3965577781200409, + "rewards/thermo_reward/mean": 0.3963066339492798, + "rewards/thermo_reward/std": 3.1332244873046875, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.59375, + "completions/mean_terminated_length": 269.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.07415536837652326, + "epoch": 0.01, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22633300721645355, + "learning_rate": 1.6e-07, + "loss": 0.0054, + "num_tokens": 43135.0, + "reward": -0.5113043785095215, + "reward_std": 10.228713035583496, + "rewards/fitness_reward/mean": 0.3053843379020691, + "rewards/fitness_reward/std": 5.22194766998291, + "rewards/kidney_reward/mean": -0.2857877314090729, + "rewards/kidney_reward/std": 2.4033970832824707, + "rewards/length2tails_reward/mean": 0.5589621067047119, + "rewards/length2tails_reward/std": 0.44231438636779785, + "rewards/repeated_in_batch_reward/mean": 0.8125, + "rewards/repeated_in_batch_reward/std": 0.3965577781200409, + "rewards/thermo_reward/mean": -0.6680471897125244, + "rewards/thermo_reward/std": 2.894864320755005, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.46875, + "completions/mean_terminated_length": 269.46875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08570086862891912, + "epoch": 0.012, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.372472882270813, + "learning_rate": 2e-07, + "loss": 0.0024, + "num_tokens": 51790.0, + "reward": 3.6113057136535645, + "reward_std": 10.176191329956055, + "rewards/fitness_reward/mean": 2.3192968368530273, + "rewards/fitness_reward/std": 5.47819709777832, + "rewards/kidney_reward/mean": 0.7390017509460449, + "rewards/kidney_reward/std": 2.422557830810547, + "rewards/length2tails_reward/mean": 0.5406535863876343, + "rewards/length2tails_reward/std": 0.4137319326400757, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.39894187450408936, + "rewards/thermo_reward/std": 3.1979048252105713, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 268.59375, + "completions/mean_terminated_length": 268.59375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.07309244154021144, + "epoch": 0.014, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4794768989086151, + "learning_rate": 2.4e-07, + "loss": -0.002, + "num_tokens": 60417.0, + "reward": 4.083155632019043, + "reward_std": 9.34925365447998, + "rewards/fitness_reward/mean": 2.4409427642822266, + "rewards/fitness_reward/std": 5.35188627243042, + "rewards/kidney_reward/mean": 0.8888282775878906, + "rewards/kidney_reward/std": 1.9836840629577637, + "rewards/length2tails_reward/mean": 0.4503927230834961, + "rewards/length2tails_reward/std": 0.430117130279541, + "rewards/repeated_in_batch_reward/mean": 0.75, + "rewards/repeated_in_batch_reward/std": 0.4399413466453552, + "rewards/thermo_reward/mean": 0.6333456635475159, + "rewards/thermo_reward/std": 2.6188883781433105, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 260.3125, + "completions/mean_terminated_length": 260.3125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.09032700955867767, + "epoch": 0.016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46145445108413696, + "learning_rate": 2.8e-07, + "loss": -0.066, + "num_tokens": 68779.0, + "reward": 2.6856346130371094, + "reward_std": 11.17774772644043, + "rewards/fitness_reward/mean": 2.2029271125793457, + "rewards/fitness_reward/std": 5.6162285804748535, + "rewards/kidney_reward/mean": 0.2928307056427002, + "rewards/kidney_reward/std": 2.640321969985962, + "rewards/length2tails_reward/mean": 0.6909781694412231, + "rewards/length2tails_reward/std": 0.3927857577800751, + "rewards/repeated_in_batch_reward/mean": 0.875, + "rewards/repeated_in_batch_reward/std": 0.33601075410842896, + "rewards/thermo_reward/mean": 0.03327919542789459, + "rewards/thermo_reward/std": 3.2679364681243896, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 251.71875, + "completions/mean_terminated_length": 251.71875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.10597977228462696, + "epoch": 0.018, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4298640191555023, + "learning_rate": 3.2e-07, + "loss": -0.0897, + "num_tokens": 76866.0, + "reward": -1.6654566526412964, + "reward_std": 11.136141777038574, + "rewards/fitness_reward/mean": 0.48509520292282104, + "rewards/fitness_reward/std": 5.592866897583008, + "rewards/kidney_reward/mean": -0.9065221548080444, + "rewards/kidney_reward/std": 2.818354606628418, + "rewards/length2tails_reward/mean": 0.7829936742782593, + "rewards/length2tails_reward/std": 0.35669487714767456, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": -1.4160789251327515, + "rewards/thermo_reward/std": 3.1695234775543213, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.03125, + "completions/mean_terminated_length": 270.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08620737865567207, + "epoch": 0.02, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21064135432243347, + "learning_rate": 3.6e-07, + "loss": 0.0034, + "num_tokens": 85539.0, + "reward": 3.2720069885253906, + "reward_std": 10.456210136413574, + "rewards/fitness_reward/mean": 2.608745574951172, + "rewards/fitness_reward/std": 5.498191833496094, + "rewards/kidney_reward/mean": 0.5225638151168823, + "rewards/kidney_reward/std": 2.4604580402374268, + "rewards/length2tails_reward/mean": 0.5517613887786865, + "rewards/length2tails_reward/std": 0.44645097851753235, + "rewards/repeated_in_batch_reward/mean": 0.90625, + "rewards/repeated_in_batch_reward/std": 0.2961445748806, + "rewards/thermo_reward/mean": -0.005103394389152527, + "rewards/thermo_reward/std": 3.0479538440704346, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08696451969444752, + "epoch": 0.022, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26834744215011597, + "learning_rate": 4e-07, + "loss": -0.001, + "num_tokens": 94215.0, + "reward": 6.2407965660095215, + "reward_std": 9.487542152404785, + "rewards/fitness_reward/mean": 3.9930999279022217, + "rewards/fitness_reward/std": 4.978099346160889, + "rewards/kidney_reward/mean": 0.9276078343391418, + "rewards/kidney_reward/std": 2.1969923973083496, + "rewards/length2tails_reward/mean": 0.5903832912445068, + "rewards/length2tails_reward/std": 0.4178674817085266, + "rewards/repeated_in_batch_reward/mean": 0.84375, + "rewards/repeated_in_batch_reward/std": 0.3689020276069641, + "rewards/thermo_reward/mean": 1.176675796508789, + "rewards/thermo_reward/std": 2.9462826251983643, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 268.15625, + "completions/mean_terminated_length": 268.15625, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.09193634986877441, + "epoch": 0.024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6354812979698181, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0025, + "num_tokens": 102828.0, + "reward": 0.7271066308021545, + "reward_std": 10.12745475769043, + "rewards/fitness_reward/mean": 1.473478078842163, + "rewards/fitness_reward/std": 5.440088272094727, + "rewards/kidney_reward/mean": -0.10944777727127075, + "rewards/kidney_reward/std": 2.518666982650757, + "rewards/length2tails_reward/mean": 0.49587365984916687, + "rewards/length2tails_reward/std": 0.4503480792045593, + "rewards/repeated_in_batch_reward/mean": 0.78125, + "rewards/repeated_in_batch_reward/std": 0.420013427734375, + "rewards/thermo_reward/mean": -0.7646359205245972, + "rewards/thermo_reward/std": 2.862889051437378, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 266.03125, + "completions/mean_terminated_length": 266.03125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.08739372063428164, + "epoch": 0.026, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2770998775959015, + "learning_rate": 4.8e-07, + "loss": -0.0301, + "num_tokens": 111373.0, + "reward": 0.3932121992111206, + "reward_std": 10.29038143157959, + "rewards/fitness_reward/mean": 1.5364488363265991, + "rewards/fitness_reward/std": 5.486446857452393, + "rewards/kidney_reward/mean": -0.43019402027130127, + "rewards/kidney_reward/std": 2.4975287914276123, + "rewards/length2tails_reward/mean": 0.6225026845932007, + "rewards/length2tails_reward/std": 0.44325339794158936, + "rewards/repeated_in_batch_reward/mean": 0.8125, + "rewards/repeated_in_batch_reward/std": 0.3965577781200409, + "rewards/thermo_reward/mean": -0.856542706489563, + "rewards/thermo_reward/std": 3.0668582916259766, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.0, + "completions/mean_terminated_length": 269.0, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.07466705655679107, + "epoch": 0.028, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.268663227558136, + "learning_rate": 5.2e-07, + "loss": -0.0038, + "num_tokens": 120013.0, + "reward": 3.62774658203125, + "reward_std": 9.2737455368042, + "rewards/fitness_reward/mean": 1.760581374168396, + "rewards/fitness_reward/std": 5.374361038208008, + "rewards/kidney_reward/mean": 0.8924310803413391, + "rewards/kidney_reward/std": 1.9723899364471436, + "rewards/length2tails_reward/mean": 0.43759018182754517, + "rewards/length2tails_reward/std": 0.46352171897888184, + "rewards/repeated_in_batch_reward/mean": 0.75, + "rewards/repeated_in_batch_reward/std": 0.4399413466453552, + "rewards/thermo_reward/mean": 0.8559751510620117, + "rewards/thermo_reward/std": 2.598565101623535, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.71875, + "completions/mean_terminated_length": 269.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.07777309278026223, + "epoch": 0.03, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1419842541217804, + "learning_rate": 5.6e-07, + "loss": 0.0002, + "num_tokens": 128676.0, + "reward": -0.15081048011779785, + "reward_std": 10.177221298217773, + "rewards/fitness_reward/mean": 0.7644028663635254, + "rewards/fitness_reward/std": 5.267439365386963, + "rewards/kidney_reward/mean": -0.33808475732803345, + "rewards/kidney_reward/std": 2.4396588802337646, + "rewards/length2tails_reward/mean": 0.5479411482810974, + "rewards/length2tails_reward/std": 0.4623284339904785, + "rewards/repeated_in_batch_reward/mean": 0.78125, + "rewards/repeated_in_batch_reward/std": 0.420013427734375, + "rewards/thermo_reward/mean": -0.7100476622581482, + "rewards/thermo_reward/std": 2.836381435394287, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 266.84375, + "completions/mean_terminated_length": 266.84375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "entropy": 0.07267575850710273, + "epoch": 0.032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11633159220218658, + "learning_rate": 6e-07, + "loss": -0.0115, + "num_tokens": 137247.0, + "reward": -1.5944565534591675, + "reward_std": 10.127143859863281, + "rewards/fitness_reward/mean": -0.03426043689250946, + "rewards/fitness_reward/std": 5.2975969314575195, + "rewards/kidney_reward/mean": -0.5960159301757812, + "rewards/kidney_reward/std": 2.679882526397705, + "rewards/length2tails_reward/mean": 0.5157021284103394, + "rewards/length2tails_reward/std": 0.45434051752090454, + "rewards/repeated_in_batch_reward/mean": 0.84375, + "rewards/repeated_in_batch_reward/std": 0.3689020276069641, + "rewards/thermo_reward/mean": -1.1001253128051758, + "rewards/thermo_reward/std": 2.988361120223999, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.15625, + "completions/mean_terminated_length": 269.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.07479206379503012, + "epoch": 0.034, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3478950262069702, + "learning_rate": 6.4e-07, + "loss": 0.0022, + "num_tokens": 145892.0, + "reward": 1.75685715675354, + "reward_std": 10.682146072387695, + "rewards/fitness_reward/mean": 1.3270858526229858, + "rewards/fitness_reward/std": 5.450982093811035, + "rewards/kidney_reward/mean": 0.19966863095760345, + "rewards/kidney_reward/std": 2.483642816543579, + "rewards/length2tails_reward/mean": 0.5242193341255188, + "rewards/length2tails_reward/std": 0.4437546133995056, + "rewards/repeated_in_batch_reward/mean": 0.78125, + "rewards/repeated_in_batch_reward/std": 0.420013427734375, + "rewards/thermo_reward/mean": 0.09955573081970215, + "rewards/thermo_reward/std": 3.0443100929260254, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.8125, + "completions/mean_terminated_length": 269.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.0834053922444582, + "epoch": 0.036, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32707831263542175, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0012, + "num_tokens": 154558.0, + "reward": 4.515446186065674, + "reward_std": 9.859593391418457, + "rewards/fitness_reward/mean": 3.1498522758483887, + "rewards/fitness_reward/std": 5.432015895843506, + "rewards/kidney_reward/mean": 0.5560940504074097, + "rewards/kidney_reward/std": 2.2329390048980713, + "rewards/length2tails_reward/mean": 0.5659103393554688, + "rewards/length2tails_reward/std": 0.41648009419441223, + "rewards/repeated_in_batch_reward/mean": 0.90625, + "rewards/repeated_in_batch_reward/std": 0.2961445748806, + "rewards/thermo_reward/mean": 0.6622838973999023, + "rewards/thermo_reward/std": 3.0777931213378906, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08045468153432012, + "epoch": 0.038, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3290388584136963, + "learning_rate": 7.2e-07, + "loss": -0.0034, + "num_tokens": 163234.0, + "reward": 2.1541008949279785, + "reward_std": 9.83704662322998, + "rewards/fitness_reward/mean": 1.7646312713623047, + "rewards/fitness_reward/std": 5.3715972900390625, + "rewards/kidney_reward/mean": 0.3831119239330292, + "rewards/kidney_reward/std": 2.294931650161743, + "rewards/length2tails_reward/mean": 0.5611617565155029, + "rewards/length2tails_reward/std": 0.43626290559768677, + "rewards/repeated_in_batch_reward/mean": 0.71875, + "rewards/repeated_in_batch_reward/std": 0.45680341124534607, + "rewards/thermo_reward/mean": -0.1216331273317337, + "rewards/thermo_reward/std": 2.7697622776031494, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 266.0, + "completions/mean_terminated_length": 266.0, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.09172683954238892, + "epoch": 0.04, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23171117901802063, + "learning_rate": 7.599999999999999e-07, + "loss": -0.0154, + "num_tokens": 171778.0, + "reward": 2.084986925125122, + "reward_std": 11.089282035827637, + "rewards/fitness_reward/mean": 2.350027084350586, + "rewards/fitness_reward/std": 5.590826988220215, + "rewards/kidney_reward/mean": 0.028891414403915405, + "rewards/kidney_reward/std": 2.7076759338378906, + "rewards/length2tails_reward/mean": 0.5892895460128784, + "rewards/length2tails_reward/std": 0.4344477653503418, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": -0.45286035537719727, + "rewards/thermo_reward/std": 3.284806728363037, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 264.34375, + "completions/mean_terminated_length": 264.34375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.07852034131065011, + "epoch": 0.042, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23757749795913696, + "learning_rate": 8e-07, + "loss": -0.0244, + "num_tokens": 180269.0, + "reward": -1.1749228239059448, + "reward_std": 10.359306335449219, + "rewards/fitness_reward/mean": 0.41968971490859985, + "rewards/fitness_reward/std": 5.393928527832031, + "rewards/kidney_reward/mean": -0.6090096235275269, + "rewards/kidney_reward/std": 2.580422878265381, + "rewards/length2tails_reward/mean": 0.6238871812820435, + "rewards/length2tails_reward/std": 0.4270743131637573, + "rewards/repeated_in_batch_reward/mean": 0.8125, + "rewards/repeated_in_batch_reward/std": 0.3965577781200409, + "rewards/thermo_reward/mean": -1.129241704940796, + "rewards/thermo_reward/std": 2.9064016342163086, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 269.03125, + "completions/mean_terminated_length": 269.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0786933982744813, + "epoch": 0.044, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5175756812095642, + "learning_rate": 8.399999999999999e-07, + "loss": 0.0011, + "num_tokens": 188910.0, + "reward": 3.9556326866149902, + "reward_std": 10.06831169128418, + "rewards/fitness_reward/mean": 2.6229472160339355, + "rewards/fitness_reward/std": 5.284468173980713, + "rewards/kidney_reward/mean": 0.5688485503196716, + "rewards/kidney_reward/std": 2.2570488452911377, + "rewards/length2tails_reward/mean": 0.5303610563278198, + "rewards/length2tails_reward/std": 0.42760950326919556, + "rewards/repeated_in_batch_reward/mean": 0.8125, + "rewards/repeated_in_batch_reward/std": 0.3965577781200409, + "rewards/thermo_reward/mean": 0.629551112651825, + "rewards/thermo_reward/std": 2.8939197063446045, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 264.125, + "completions/mean_terminated_length": 264.125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.08195907855406404, + "epoch": 0.046, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3343675434589386, + "learning_rate": 8.799999999999999e-07, + "loss": -0.0321, + "num_tokens": 197394.0, + "reward": 3.067450523376465, + "reward_std": 10.474081993103027, + "rewards/fitness_reward/mean": 2.238030195236206, + "rewards/fitness_reward/std": 5.569760799407959, + "rewards/kidney_reward/mean": 0.4576724171638489, + "rewards/kidney_reward/std": 2.436845064163208, + "rewards/length2tails_reward/mean": 0.5516926050186157, + "rewards/length2tails_reward/std": 0.43515467643737793, + "rewards/repeated_in_batch_reward/mean": 0.84375, + "rewards/repeated_in_batch_reward/std": 0.3689020276069641, + "rewards/thermo_reward/mean": 0.2322039157152176, + "rewards/thermo_reward/std": 3.0297348499298096, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 268.78125, + "completions/mean_terminated_length": 268.78125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.10035552131012082, + "epoch": 0.048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3725874722003937, + "learning_rate": 9.2e-07, + "loss": -0.0084, + "num_tokens": 206027.0, + "reward": 2.082176685333252, + "reward_std": 11.04091739654541, + "rewards/fitness_reward/mean": 1.8071531057357788, + "rewards/fitness_reward/std": 5.459597587585449, + "rewards/kidney_reward/mean": -0.025432132184505463, + "rewards/kidney_reward/std": 2.6043825149536133, + "rewards/length2tails_reward/mean": 0.6623145937919617, + "rewards/length2tails_reward/std": 0.3805418014526367, + "rewards/repeated_in_batch_reward/mean": 0.84375, + "rewards/repeated_in_batch_reward/std": 0.3689020276069641, + "rewards/thermo_reward/mean": 0.14984972774982452, + "rewards/thermo_reward/std": 3.2950596809387207, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08541291346773505, + "epoch": 0.05, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20709945261478424, + "learning_rate": 9.6e-07, + "loss": 0.0025, + "num_tokens": 214721.0, + "reward": 2.1533591747283936, + "reward_std": 10.898603439331055, + "rewards/fitness_reward/mean": 1.8978402614593506, + "rewards/fitness_reward/std": 5.5759453773498535, + "rewards/kidney_reward/mean": 0.1307317316532135, + "rewards/kidney_reward/std": 2.487185001373291, + "rewards/length2tails_reward/mean": 0.6618248224258423, + "rewards/length2tails_reward/std": 0.4177837669849396, + "rewards/repeated_in_batch_reward/mean": 0.84375, + "rewards/repeated_in_batch_reward/std": 0.3689020276069641, + "rewards/thermo_reward/mean": -0.025770097970962524, + "rewards/thermo_reward/std": 3.1279430389404297, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.101377141661942, + "epoch": 0.052, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6182202100753784, + "learning_rate": 1e-06, + "loss": 0.0049, + "num_tokens": 223439.0, + "reward": -0.09726998209953308, + "reward_std": 10.948629379272461, + "rewards/fitness_reward/mean": 1.2619261741638184, + "rewards/fitness_reward/std": 5.739231109619141, + "rewards/kidney_reward/mean": -0.5961160063743591, + "rewards/kidney_reward/std": 2.6597819328308105, + "rewards/length2tails_reward/mean": 0.6376259326934814, + "rewards/length2tails_reward/std": 0.4443195164203644, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": -0.9205924272537231, + "rewards/thermo_reward/std": 3.2762088775634766, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10240273922681808, + "epoch": 0.054, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13618984818458557, + "learning_rate": 1.04e-06, + "loss": 0.0074, + "num_tokens": 232119.0, + "reward": 2.9474754333496094, + "reward_std": 11.085765838623047, + "rewards/fitness_reward/mean": 2.818065643310547, + "rewards/fitness_reward/std": 5.607751846313477, + "rewards/kidney_reward/mean": 0.2121862769126892, + "rewards/kidney_reward/std": 2.740105152130127, + "rewards/length2tails_reward/mean": 0.6210638284683228, + "rewards/length2tails_reward/std": 0.393775999546051, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": -0.24488294124603271, + "rewards/thermo_reward/std": 3.3333635330200195, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 267.375, + "completions/mean_terminated_length": 267.375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "entropy": 0.08450194029137492, + "epoch": 0.056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5073533654212952, + "learning_rate": 1.08e-06, + "loss": -0.0081, + "num_tokens": 240707.0, + "reward": 4.150449752807617, + "reward_std": 10.743106842041016, + "rewards/fitness_reward/mean": 2.5777180194854736, + "rewards/fitness_reward/std": 5.342399597167969, + "rewards/kidney_reward/mean": 0.5591606497764587, + "rewards/kidney_reward/std": 2.441612482070923, + "rewards/length2tails_reward/mean": 0.5295137166976929, + "rewards/length2tails_reward/std": 0.4105660319328308, + "rewards/repeated_in_batch_reward/mean": 0.78125, + "rewards/repeated_in_batch_reward/std": 0.420013427734375, + "rewards/thermo_reward/mean": 0.8824952244758606, + "rewards/thermo_reward/std": 3.181992769241333, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.0625, + "completions/mean_terminated_length": 271.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08082179352641106, + "epoch": 0.058, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15366347134113312, + "learning_rate": 1.12e-06, + "loss": 0.0001, + "num_tokens": 249413.0, + "reward": 1.9728987216949463, + "reward_std": 10.83619499206543, + "rewards/fitness_reward/mean": 1.815950632095337, + "rewards/fitness_reward/std": 5.659666538238525, + "rewards/kidney_reward/mean": 0.1731961965560913, + "rewards/kidney_reward/std": 2.6282520294189453, + "rewards/length2tails_reward/mean": 0.6286097764968872, + "rewards/length2tails_reward/std": 0.43655481934547424, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": -0.17285895347595215, + "rewards/thermo_reward/std": 3.2424631118774414, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08776107709854841, + "epoch": 0.06, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11649761348962784, + "learning_rate": 1.16e-06, + "loss": 0.0009, + "num_tokens": 258116.0, + "reward": 5.3024468421936035, + "reward_std": 10.292346954345703, + "rewards/fitness_reward/mean": 3.8956284523010254, + "rewards/fitness_reward/std": 5.090051174163818, + "rewards/kidney_reward/mean": 0.7544621229171753, + "rewards/kidney_reward/std": 2.5224790573120117, + "rewards/length2tails_reward/mean": 0.685206413269043, + "rewards/length2tails_reward/std": 0.3567609190940857, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 0.4900863766670227, + "rewards/thermo_reward/std": 3.1310408115386963, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.9375, + "completions/mean_terminated_length": 269.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.07893102057278156, + "epoch": 0.062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2900700271129608, + "learning_rate": 1.2e-06, + "loss": 0.005, + "num_tokens": 266786.0, + "reward": 1.427734375, + "reward_std": 11.001752853393555, + "rewards/fitness_reward/mean": 1.4691399335861206, + "rewards/fitness_reward/std": 5.541495323181152, + "rewards/kidney_reward/mean": -0.09434545040130615, + "rewards/kidney_reward/std": 2.562530994415283, + "rewards/length2tails_reward/mean": 0.6205853819847107, + "rewards/length2tails_reward/std": 0.4454227685928345, + "rewards/repeated_in_batch_reward/mean": 0.875, + "rewards/repeated_in_batch_reward/std": 0.33601075410842896, + "rewards/thermo_reward/mean": -0.09661871194839478, + "rewards/thermo_reward/std": 3.113922119140625, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09993747808039188, + "epoch": 0.064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22942373156547546, + "learning_rate": 1.24e-06, + "loss": 0.001, + "num_tokens": 275502.0, + "reward": 2.7124903202056885, + "reward_std": 10.51668643951416, + "rewards/fitness_reward/mean": 2.0883593559265137, + "rewards/fitness_reward/std": 5.528704643249512, + "rewards/kidney_reward/mean": 0.3384060561656952, + "rewards/kidney_reward/std": 2.4939019680023193, + "rewards/length2tails_reward/mean": 0.6918207406997681, + "rewards/length2tails_reward/std": 0.4003857672214508, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 0.12279275059700012, + "rewards/thermo_reward/std": 3.189255714416504, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08446257747709751, + "epoch": 0.066, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19864203035831451, + "learning_rate": 1.28e-06, + "loss": -0.0032, + "num_tokens": 284182.0, + "reward": 4.002676486968994, + "reward_std": 9.273280143737793, + "rewards/fitness_reward/mean": 2.842696189880371, + "rewards/fitness_reward/std": 5.155837535858154, + "rewards/kidney_reward/mean": 0.535129964351654, + "rewards/kidney_reward/std": 2.1154778003692627, + "rewards/length2tails_reward/mean": 0.5688945055007935, + "rewards/length2tails_reward/std": 0.4270206093788147, + "rewards/repeated_in_batch_reward/mean": 0.8125, + "rewards/repeated_in_batch_reward/std": 0.3965577781200409, + "rewards/thermo_reward/mean": 0.486710786819458, + "rewards/thermo_reward/std": 2.8715450763702393, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.09375, + "completions/mean_terminated_length": 270.09375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09240176528692245, + "epoch": 0.068, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17734283208847046, + "learning_rate": 1.32e-06, + "loss": -0.0013, + "num_tokens": 292857.0, + "reward": 7.405562400817871, + "reward_std": 8.667160034179688, + "rewards/fitness_reward/mean": 4.910820007324219, + "rewards/fitness_reward/std": 4.48259973526001, + "rewards/kidney_reward/mean": 1.2963453531265259, + "rewards/kidney_reward/std": 1.9723432064056396, + "rewards/length2tails_reward/mean": 0.5431360006332397, + "rewards/length2tails_reward/std": 0.43262580037117004, + "rewards/repeated_in_batch_reward/mean": 0.90625, + "rewards/repeated_in_batch_reward/std": 0.2961445748806, + "rewards/thermo_reward/mean": 1.0534589290618896, + "rewards/thermo_reward/std": 2.9402530193328857, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08174511976540089, + "epoch": 0.07, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1431119590997696, + "learning_rate": 1.3600000000000001e-06, + "loss": -0.0001, + "num_tokens": 301540.0, + "reward": 3.6283020973205566, + "reward_std": 10.12381362915039, + "rewards/fitness_reward/mean": 2.708242893218994, + "rewards/fitness_reward/std": 5.337825775146484, + "rewards/kidney_reward/mean": 0.5316824913024902, + "rewards/kidney_reward/std": 2.3590903282165527, + "rewards/length2tails_reward/mean": 0.5538583993911743, + "rewards/length2tails_reward/std": 0.434944212436676, + "rewards/repeated_in_batch_reward/mean": 0.90625, + "rewards/repeated_in_batch_reward/std": 0.2961445748806, + "rewards/thermo_reward/mean": 0.242366224527359, + "rewards/thermo_reward/std": 3.0170631408691406, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.8125, + "completions/mean_terminated_length": 269.8125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09009748417884111, + "epoch": 0.072, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33221614360809326, + "learning_rate": 1.4e-06, + "loss": 0.0038, + "num_tokens": 310206.0, + "reward": 3.3963706493377686, + "reward_std": 9.3759765625, + "rewards/fitness_reward/mean": 3.760885715484619, + "rewards/fitness_reward/std": 5.113700866699219, + "rewards/kidney_reward/mean": 0.23802819848060608, + "rewards/kidney_reward/std": 2.434286594390869, + "rewards/length2tails_reward/mean": 0.5747653245925903, + "rewards/length2tails_reward/std": 0.4042164981365204, + "rewards/repeated_in_batch_reward/mean": 0.90625, + "rewards/repeated_in_batch_reward/std": 0.2961445748806, + "rewards/thermo_reward/mean": -0.7506450414657593, + "rewards/thermo_reward/std": 2.9914844036102295, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.5, + "completions/mean_terminated_length": 269.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08821279183030128, + "epoch": 0.074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1559227555990219, + "learning_rate": 1.44e-06, + "loss": 0.0016, + "num_tokens": 318862.0, + "reward": 4.784730911254883, + "reward_std": 10.130960464477539, + "rewards/fitness_reward/mean": 3.2076220512390137, + "rewards/fitness_reward/std": 5.36002779006958, + "rewards/kidney_reward/mean": 0.719211757183075, + "rewards/kidney_reward/std": 2.3217809200286865, + "rewards/length2tails_reward/mean": 0.5281630158424377, + "rewards/length2tails_reward/std": 0.40869244933128357, + "rewards/repeated_in_batch_reward/mean": 0.90625, + "rewards/repeated_in_batch_reward/std": 0.2961445748806, + "rewards/thermo_reward/mean": 0.7144560813903809, + "rewards/thermo_reward/std": 3.1187803745269775, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.5, + "completions/mean_terminated_length": 269.5, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.08450527861714363, + "epoch": 0.076, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18637605011463165, + "learning_rate": 1.48e-06, + "loss": -0.0027, + "num_tokens": 327518.0, + "reward": 6.34855842590332, + "reward_std": 8.797586441040039, + "rewards/fitness_reward/mean": 4.340523719787598, + "rewards/fitness_reward/std": 4.804779052734375, + "rewards/kidney_reward/mean": 1.185357689857483, + "rewards/kidney_reward/std": 2.0141444206237793, + "rewards/length2tails_reward/mean": 0.48803573846817017, + "rewards/length2tails_reward/std": 0.4352822005748749, + "rewards/repeated_in_batch_reward/mean": 0.875, + "rewards/repeated_in_batch_reward/std": 0.33601075410842896, + "rewards/thermo_reward/mean": 0.6863734722137451, + "rewards/thermo_reward/std": 2.7634387016296387, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 270.0625, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.09006988536566496, + "epoch": 0.078, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2921997308731079, + "learning_rate": 1.5199999999999998e-06, + "loss": -0.0037, + "num_tokens": 336192.0, + "reward": 3.6054391860961914, + "reward_std": 10.492602348327637, + "rewards/fitness_reward/mean": 2.8815646171569824, + "rewards/fitness_reward/std": 5.417086124420166, + "rewards/kidney_reward/mean": 0.4850673973560333, + "rewards/kidney_reward/std": 2.5406293869018555, + "rewards/length2tails_reward/mean": 0.6245236396789551, + "rewards/length2tails_reward/std": 0.4255659580230713, + "rewards/repeated_in_batch_reward/mean": 0.875, + "rewards/repeated_in_batch_reward/std": 0.33601075410842896, + "rewards/thermo_reward/mean": 0.0888550728559494, + "rewards/thermo_reward/std": 3.0721518993377686, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.03125, + "completions/mean_terminated_length": 271.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09613204980269074, + "epoch": 0.08, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12304268777370453, + "learning_rate": 1.5599999999999999e-06, + "loss": 0.0001, + "num_tokens": 344897.0, + "reward": 2.7564549446105957, + "reward_std": 10.53861141204834, + "rewards/fitness_reward/mean": 2.2039315700531006, + "rewards/fitness_reward/std": 5.489523887634277, + "rewards/kidney_reward/mean": 0.2535344660282135, + "rewards/kidney_reward/std": 2.427860736846924, + "rewards/length2tails_reward/mean": 0.6793840527534485, + "rewards/length2tails_reward/std": 0.3961794376373291, + "rewards/repeated_in_batch_reward/mean": 0.875, + "rewards/repeated_in_batch_reward/std": 0.33601075410842896, + "rewards/thermo_reward/mean": 0.1435505896806717, + "rewards/thermo_reward/std": 3.074892520904541, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 268.96875, + "completions/mean_terminated_length": 268.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08931446820497513, + "epoch": 0.082, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1744069755077362, + "learning_rate": 1.6e-06, + "loss": -0.0007, + "num_tokens": 353536.0, + "reward": 5.175799369812012, + "reward_std": 8.847229957580566, + "rewards/fitness_reward/mean": 4.311430931091309, + "rewards/fitness_reward/std": 4.850402355194092, + "rewards/kidney_reward/mean": 0.6826827526092529, + "rewards/kidney_reward/std": 2.2333693504333496, + "rewards/length2tails_reward/mean": 0.4906601905822754, + "rewards/length2tails_reward/std": 0.4163333475589752, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 0.03886973857879639, + "rewards/thermo_reward/std": 3.012194871902466, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 264.375, + "completions/mean_terminated_length": 264.375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.08490060642361641, + "epoch": 0.084, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22794437408447266, + "learning_rate": 1.6399999999999998e-06, + "loss": -0.0285, + "num_tokens": 362028.0, + "reward": 2.1772820949554443, + "reward_std": 10.892127990722656, + "rewards/fitness_reward/mean": 1.9527804851531982, + "rewards/fitness_reward/std": 5.5872039794921875, + "rewards/kidney_reward/mean": 0.060886450111866, + "rewards/kidney_reward/std": 2.6227242946624756, + "rewards/length2tails_reward/mean": 0.5533137917518616, + "rewards/length2tails_reward/std": 0.44542330503463745, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 0.014533862471580505, + "rewards/thermo_reward/std": 3.3594391345977783, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 264.3125, + "completions/mean_terminated_length": 264.3125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.09379345551133156, + "epoch": 0.086, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19165834784507751, + "learning_rate": 1.6799999999999998e-06, + "loss": -0.0471, + "num_tokens": 370518.0, + "reward": 5.736239910125732, + "reward_std": 9.859665870666504, + "rewards/fitness_reward/mean": 4.196285247802734, + "rewards/fitness_reward/std": 5.0344109535217285, + "rewards/kidney_reward/mean": 1.027575969696045, + "rewards/kidney_reward/std": 2.3718574047088623, + "rewards/length2tails_reward/mean": 0.6502601504325867, + "rewards/length2tails_reward/std": 0.3797384202480316, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 0.3536030650138855, + "rewards/thermo_reward/std": 3.109739303588867, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.1875, + "completions/mean_terminated_length": 270.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09621737897396088, + "epoch": 0.088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16425421833992004, + "learning_rate": 1.7199999999999998e-06, + "loss": 0.001, + "num_tokens": 379196.0, + "reward": 5.795779228210449, + "reward_std": 9.847657203674316, + "rewards/fitness_reward/mean": 3.5787415504455566, + "rewards/fitness_reward/std": 5.213191986083984, + "rewards/kidney_reward/mean": 1.1071183681488037, + "rewards/kidney_reward/std": 2.1389410495758057, + "rewards/length2tails_reward/mean": 0.6040310263633728, + "rewards/length2tails_reward/std": 0.4115493595600128, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 0.9557660818099976, + "rewards/thermo_reward/std": 3.0586154460906982, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 268.96875, + "completions/mean_terminated_length": 268.96875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.0871056062169373, + "epoch": 0.09, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24304160475730896, + "learning_rate": 1.7599999999999999e-06, + "loss": 0.0022, + "num_tokens": 387835.0, + "reward": 4.993832111358643, + "reward_std": 10.746659278869629, + "rewards/fitness_reward/mean": 3.420689582824707, + "rewards/fitness_reward/std": 5.437535285949707, + "rewards/kidney_reward/mean": 0.7717792987823486, + "rewards/kidney_reward/std": 2.535116672515869, + "rewards/length2tails_reward/mean": 0.5175907611846924, + "rewards/length2tails_reward/std": 0.45717287063598633, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.6496044397354126, + "rewards/thermo_reward/std": 3.1371889114379883, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.6875, + "completions/mean_terminated_length": 269.6875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0908602443523705, + "epoch": 0.092, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11827373504638672, + "learning_rate": 1.8e-06, + "loss": 0.0039, + "num_tokens": 396497.0, + "reward": 5.899564743041992, + "reward_std": 9.54543399810791, + "rewards/fitness_reward/mean": 4.488227844238281, + "rewards/fitness_reward/std": 4.842160701751709, + "rewards/kidney_reward/mean": 0.8022502660751343, + "rewards/kidney_reward/std": 2.440622091293335, + "rewards/length2tails_reward/mean": 0.566992461681366, + "rewards/length2tails_reward/std": 0.3695901930332184, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 0.4586372375488281, + "rewards/thermo_reward/std": 3.1700398921966553, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08991517964750528, + "epoch": 0.094, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07967487722635269, + "learning_rate": 1.84e-06, + "loss": 0.0012, + "num_tokens": 405210.0, + "reward": 7.124594211578369, + "reward_std": 9.315996170043945, + "rewards/fitness_reward/mean": 4.596066474914551, + "rewards/fitness_reward/std": 4.757828235626221, + "rewards/kidney_reward/mean": 1.2813934087753296, + "rewards/kidney_reward/std": 2.2047688961029053, + "rewards/length2tails_reward/mean": 0.6318175792694092, + "rewards/length2tails_reward/std": 0.4136396050453186, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 1.0902026891708374, + "rewards/thermo_reward/std": 2.822467565536499, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08989114267751575, + "epoch": 0.096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0939304381608963, + "learning_rate": 1.8799999999999998e-06, + "loss": 0.0039, + "num_tokens": 413886.0, + "reward": 4.650942325592041, + "reward_std": 10.98830509185791, + "rewards/fitness_reward/mean": 3.210702419281006, + "rewards/fitness_reward/std": 5.458611011505127, + "rewards/kidney_reward/mean": 0.6795892119407654, + "rewards/kidney_reward/std": 2.532743453979492, + "rewards/length2tails_reward/mean": 0.5853027105331421, + "rewards/length2tails_reward/std": 0.42769482731819153, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.6021201610565186, + "rewards/thermo_reward/std": 3.22406005859375, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09917700570076704, + "epoch": 0.098, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15307842195034027, + "learning_rate": 1.92e-06, + "loss": 0.0077, + "num_tokens": 422569.0, + "reward": 5.508678436279297, + "reward_std": 10.777318954467773, + "rewards/fitness_reward/mean": 3.7935280799865723, + "rewards/fitness_reward/std": 5.3776984214782715, + "rewards/kidney_reward/mean": 0.8095374703407288, + "rewards/kidney_reward/std": 2.691157817840576, + "rewards/length2tails_reward/mean": 0.6158257722854614, + "rewards/length2tails_reward/std": 0.3776721954345703, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.7440307140350342, + "rewards/thermo_reward/std": 3.310176134109497, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08912370260804892, + "epoch": 0.1, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1141011118888855, + "learning_rate": 1.96e-06, + "loss": 0.0002, + "num_tokens": 431255.0, + "reward": 3.954058885574341, + "reward_std": 9.64480972290039, + "rewards/fitness_reward/mean": 2.8021559715270996, + "rewards/fitness_reward/std": 5.311481952667236, + "rewards/kidney_reward/mean": 0.5934000611305237, + "rewards/kidney_reward/std": 2.2488973140716553, + "rewards/length2tails_reward/mean": 0.5961021184921265, + "rewards/length2tails_reward/std": 0.4461689591407776, + "rewards/repeated_in_batch_reward/mean": 0.90625, + "rewards/repeated_in_batch_reward/std": 0.2961445748806, + "rewards/thermo_reward/mean": 0.4082678556442261, + "rewards/thermo_reward/std": 2.953021764755249, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.34375, + "completions/mean_terminated_length": 269.34375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.08916603773832321, + "epoch": 0.102, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14698363840579987, + "learning_rate": 2e-06, + "loss": -0.0057, + "num_tokens": 439906.0, + "reward": 7.744524002075195, + "reward_std": 7.743293285369873, + "rewards/fitness_reward/mean": 4.869905948638916, + "rewards/fitness_reward/std": 4.543521404266357, + "rewards/kidney_reward/mean": 1.560369849205017, + "rewards/kidney_reward/std": 1.8243423700332642, + "rewards/length2tails_reward/mean": 0.4840232729911804, + "rewards/length2tails_reward/std": 0.41251611709594727, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 1.1720962524414062, + "rewards/thermo_reward/std": 2.5674214363098145, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09895084332674742, + "epoch": 0.104, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11244598031044006, + "learning_rate": 1.9999991778756307e-06, + "loss": 0.0014, + "num_tokens": 448602.0, + "reward": 8.26731014251709, + "reward_std": 9.016767501831055, + "rewards/fitness_reward/mean": 4.987358570098877, + "rewards/fitness_reward/std": 4.567765235900879, + "rewards/kidney_reward/mean": 1.553128957748413, + "rewards/kidney_reward/std": 2.018177032470703, + "rewards/length2tails_reward/mean": 0.6099748015403748, + "rewards/length2tails_reward/std": 0.41677555441856384, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.5658247470855713, + "rewards/thermo_reward/std": 2.8165760040283203, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 267.09375, + "completions/mean_terminated_length": 267.09375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.09943242277950048, + "epoch": 0.106, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19205942749977112, + "learning_rate": 1.999996711503876e-06, + "loss": -0.0187, + "num_tokens": 457181.0, + "reward": 5.50713586807251, + "reward_std": 10.876879692077637, + "rewards/fitness_reward/mean": 3.5678153038024902, + "rewards/fitness_reward/std": 5.341129302978516, + "rewards/kidney_reward/mean": 0.8138814568519592, + "rewards/kidney_reward/std": 2.539766550064087, + "rewards/length2tails_reward/mean": 0.636076807975769, + "rewards/length2tails_reward/std": 0.4049949049949646, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.9618313908576965, + "rewards/thermo_reward/std": 3.2641184329986572, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.90625, + "completions/mean_terminated_length": 269.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09612957295030355, + "epoch": 0.108, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12145273387432098, + "learning_rate": 1.9999926008887906e-06, + "loss": 0.0016, + "num_tokens": 465850.0, + "reward": 5.88297176361084, + "reward_std": 9.687450408935547, + "rewards/fitness_reward/mean": 3.9562315940856934, + "rewards/fitness_reward/std": 5.032125473022461, + "rewards/kidney_reward/mean": 0.9145177602767944, + "rewards/kidney_reward/std": 2.250870943069458, + "rewards/length2tails_reward/mean": 0.5738779902458191, + "rewards/length2tails_reward/std": 0.4065060615539551, + "rewards/repeated_in_batch_reward/mean": 0.875, + "rewards/repeated_in_batch_reward/std": 0.33601075410842896, + "rewards/thermo_reward/mean": 0.867334246635437, + "rewards/thermo_reward/std": 2.963379383087158, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.53125, + "completions/mean_terminated_length": 272.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10116562247276306, + "epoch": 0.11, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07908673584461212, + "learning_rate": 1.999986846037133e-06, + "loss": 0.0, + "num_tokens": 474603.0, + "reward": 4.582418441772461, + "reward_std": 10.85931396484375, + "rewards/fitness_reward/mean": 3.5986382961273193, + "rewards/fitness_reward/std": 5.242656707763672, + "rewards/kidney_reward/mean": 0.5228749513626099, + "rewards/kidney_reward/std": 2.615586042404175, + "rewards/length2tails_reward/mean": 0.7722164988517761, + "rewards/length2tails_reward/std": 0.31869614124298096, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.28368380665779114, + "rewards/thermo_reward/std": 3.4354615211486816, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09424130246043205, + "epoch": 0.112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07371854037046432, + "learning_rate": 1.9999794469583658e-06, + "loss": 0.0014, + "num_tokens": 483317.0, + "reward": 5.236574649810791, + "reward_std": 9.785390853881836, + "rewards/fitness_reward/mean": 3.810666799545288, + "rewards/fitness_reward/std": 5.2446608543396, + "rewards/kidney_reward/mean": 0.7944426536560059, + "rewards/kidney_reward/std": 2.3587422370910645, + "rewards/length2tails_reward/mean": 0.600432813167572, + "rewards/length2tails_reward/std": 0.43214908242225647, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.4714217782020569, + "rewards/thermo_reward/std": 3.0919981002807617, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 263.65625, + "completions/mean_terminated_length": 263.65625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.09555036667734385, + "epoch": 0.114, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1235174760222435, + "learning_rate": 1.9999704036646555e-06, + "loss": -0.0246, + "num_tokens": 491786.0, + "reward": 2.1675093173980713, + "reward_std": 11.569999694824219, + "rewards/fitness_reward/mean": 2.065021514892578, + "rewards/fitness_reward/std": 5.637988090515137, + "rewards/kidney_reward/mean": 0.03666689991950989, + "rewards/kidney_reward/std": 2.760089635848999, + "rewards/length2tails_reward/mean": 0.616014838218689, + "rewards/length2tails_reward/std": 0.41217100620269775, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": -0.0895305871963501, + "rewards/thermo_reward/std": 3.400143623352051, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1021167878061533, + "epoch": 0.116, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06497107446193695, + "learning_rate": 1.999959716170871e-06, + "loss": 0.0026, + "num_tokens": 500458.0, + "reward": 8.043519020080566, + "reward_std": 8.370131492614746, + "rewards/fitness_reward/mean": 5.48520565032959, + "rewards/fitness_reward/std": 4.132047176361084, + "rewards/kidney_reward/mean": 1.4747217893600464, + "rewards/kidney_reward/std": 2.0179266929626465, + "rewards/length2tails_reward/mean": 0.6013509035110474, + "rewards/length2tails_reward/std": 0.36498507857322693, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.9234564304351807, + "rewards/thermo_reward/std": 2.9822323322296143, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.625, + "completions/mean_terminated_length": 270.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08857986517250538, + "epoch": 0.118, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09607218205928802, + "learning_rate": 1.999947384494585e-06, + "loss": 0.0064, + "num_tokens": 509150.0, + "reward": 1.3199851512908936, + "reward_std": 10.509997367858887, + "rewards/fitness_reward/mean": 2.571333885192871, + "rewards/fitness_reward/std": 5.572249889373779, + "rewards/kidney_reward/mean": -0.37929677963256836, + "rewards/kidney_reward/std": 2.616180658340454, + "rewards/length2tails_reward/mean": 0.6478626728057861, + "rewards/length2tails_reward/std": 0.4019405245780945, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": -1.0368380546569824, + "rewards/thermo_reward/std": 3.0118613243103027, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10882160067558289, + "epoch": 0.12, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13335488736629486, + "learning_rate": 1.9999334086560746e-06, + "loss": 0.0025, + "num_tokens": 517898.0, + "reward": 4.216526985168457, + "reward_std": 11.425372123718262, + "rewards/fitness_reward/mean": 3.1399548053741455, + "rewards/fitness_reward/std": 5.560117244720459, + "rewards/kidney_reward/mean": 0.4496838450431824, + "rewards/kidney_reward/std": 2.7316980361938477, + "rewards/length2tails_reward/mean": 0.739764392375946, + "rewards/length2tails_reward/std": 0.348965585231781, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.45291224122047424, + "rewards/thermo_reward/std": 3.5019266605377197, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09633060963824391, + "epoch": 0.122, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09005337953567505, + "learning_rate": 1.999917788678319e-06, + "loss": 0.0008, + "num_tokens": 526594.0, + "reward": 5.742714881896973, + "reward_std": 9.801647186279297, + "rewards/fitness_reward/mean": 4.097263813018799, + "rewards/fitness_reward/std": 5.070271015167236, + "rewards/kidney_reward/mean": 0.6791552901268005, + "rewards/kidney_reward/std": 2.318540573120117, + "rewards/length2tails_reward/mean": 0.6282171607017517, + "rewards/length2tails_reward/std": 0.4059312045574188, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.8034740686416626, + "rewards/thermo_reward/std": 3.17315411567688, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.875, + "completions/mean_terminated_length": 269.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10056066047400236, + "epoch": 0.124, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09247541427612305, + "learning_rate": 1.9999005245870014e-06, + "loss": 0.0059, + "num_tokens": 535262.0, + "reward": 4.459540367126465, + "reward_std": 10.471243858337402, + "rewards/fitness_reward/mean": 3.3481173515319824, + "rewards/fitness_reward/std": 5.301581859588623, + "rewards/kidney_reward/mean": 0.5900368094444275, + "rewards/kidney_reward/std": 2.4857983589172363, + "rewards/length2tails_reward/mean": 0.6027665138244629, + "rewards/length2tails_reward/std": 0.40925168991088867, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.3611098527908325, + "rewards/thermo_reward/std": 3.0389420986175537, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09483396168798208, + "epoch": 0.126, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10048498958349228, + "learning_rate": 1.9998816164105085e-06, + "loss": 0.0018, + "num_tokens": 543958.0, + "reward": 5.394386291503906, + "reward_std": 9.99219036102295, + "rewards/fitness_reward/mean": 4.284843921661377, + "rewards/fitness_reward/std": 4.95587682723999, + "rewards/kidney_reward/mean": 0.7851080894470215, + "rewards/kidney_reward/std": 2.488471031188965, + "rewards/length2tails_reward/mean": 0.644648551940918, + "rewards/length2tails_reward/std": 0.3986409604549408, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.15996931493282318, + "rewards/thermo_reward/std": 3.202000617980957, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10331257060170174, + "epoch": 0.128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06592488288879395, + "learning_rate": 1.99986106417993e-06, + "loss": -0.0012, + "num_tokens": 552635.0, + "reward": 8.70488166809082, + "reward_std": 7.780556678771973, + "rewards/fitness_reward/mean": 5.549319267272949, + "rewards/fitness_reward/std": 4.140100479125977, + "rewards/kidney_reward/mean": 1.7368488311767578, + "rewards/kidney_reward/std": 1.8308501243591309, + "rewards/length2tails_reward/mean": 0.5674126148223877, + "rewards/length2tails_reward/std": 0.38003429770469666, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.2619726657867432, + "rewards/thermo_reward/std": 2.896923542022705, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.4375, + "completions/mean_terminated_length": 269.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09276031795889139, + "epoch": 0.13, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07289103418588638, + "learning_rate": 1.999838867929058e-06, + "loss": 0.0088, + "num_tokens": 561289.0, + "reward": 6.259957313537598, + "reward_std": 9.348736763000488, + "rewards/fitness_reward/mean": 4.764282703399658, + "rewards/fitness_reward/std": 4.725064277648926, + "rewards/kidney_reward/mean": 0.948445737361908, + "rewards/kidney_reward/std": 2.2839484214782715, + "rewards/length2tails_reward/mean": 0.5402899980545044, + "rewards/length2tails_reward/std": 0.4097994267940521, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.39319971203804016, + "rewards/thermo_reward/std": 3.061786651611328, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.0625, + "completions/mean_terminated_length": 271.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10293789114803076, + "epoch": 0.132, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07653038948774338, + "learning_rate": 1.99981502769439e-06, + "loss": 0.0026, + "num_tokens": 569995.0, + "reward": 3.9891343116760254, + "reward_std": 11.183003425598145, + "rewards/fitness_reward/mean": 3.2974727153778076, + "rewards/fitness_reward/std": 5.498640537261963, + "rewards/kidney_reward/mean": 0.4626770615577698, + "rewards/kidney_reward/std": 2.654911518096924, + "rewards/length2tails_reward/mean": 0.7131531834602356, + "rewards/length2tails_reward/std": 0.3393803834915161, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.05766934156417847, + "rewards/thermo_reward/std": 3.530383825302124, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.0999839473515749, + "epoch": 0.134, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11364952474832535, + "learning_rate": 1.9997895435151245e-06, + "loss": 0.0032, + "num_tokens": 578666.0, + "reward": 3.903465747833252, + "reward_std": 10.257515907287598, + "rewards/fitness_reward/mean": 2.9853196144104004, + "rewards/fitness_reward/std": 5.393058776855469, + "rewards/kidney_reward/mean": 0.5317540168762207, + "rewards/kidney_reward/std": 2.351651906967163, + "rewards/length2tails_reward/mean": 0.600960373878479, + "rewards/length2tails_reward/std": 0.42705038189888, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 0.23254606127738953, + "rewards/thermo_reward/std": 2.9828085899353027, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.03125, + "completions/mean_terminated_length": 269.03125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.10749497450888157, + "epoch": 0.136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21903333067893982, + "learning_rate": 1.999762415433164e-06, + "loss": -0.0117, + "num_tokens": 587307.0, + "reward": 6.335038185119629, + "reward_std": 9.666593551635742, + "rewards/fitness_reward/mean": 4.830661773681641, + "rewards/fitness_reward/std": 4.752323150634766, + "rewards/kidney_reward/mean": 0.9589738845825195, + "rewards/kidney_reward/std": 2.4108433723449707, + "rewards/length2tails_reward/mean": 0.5960825085639954, + "rewards/length2tails_reward/std": 0.39040085673332214, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.38579437136650085, + "rewards/thermo_reward/std": 3.2222890853881836, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09495829651132226, + "epoch": 0.138, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1579008847475052, + "learning_rate": 1.9997336434931136e-06, + "loss": -0.0066, + "num_tokens": 596005.0, + "reward": 6.201303958892822, + "reward_std": 8.618098258972168, + "rewards/fitness_reward/mean": 4.139523506164551, + "rewards/fitness_reward/std": 4.9184184074401855, + "rewards/kidney_reward/mean": 1.0352199077606201, + "rewards/kidney_reward/std": 2.0201306343078613, + "rewards/length2tails_reward/mean": 0.5589442253112793, + "rewards/length2tails_reward/std": 0.4349207878112793, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 0.876916229724884, + "rewards/thermo_reward/std": 2.810779333114624, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.09140817727893591, + "epoch": 0.14, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08873090893030167, + "learning_rate": 1.9997032277422817e-06, + "loss": 0.0014, + "num_tokens": 604688.0, + "reward": 4.505502700805664, + "reward_std": 9.884005546569824, + "rewards/fitness_reward/mean": 3.5307188034057617, + "rewards/fitness_reward/std": 5.276813507080078, + "rewards/kidney_reward/mean": 0.6191385984420776, + "rewards/kidney_reward/std": 2.336310863494873, + "rewards/length2tails_reward/mean": 0.6345251202583313, + "rewards/length2tails_reward/std": 0.406307190656662, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.19219228625297546, + "rewards/thermo_reward/std": 2.8584790229797363, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 268.125, + "completions/mean_terminated_length": 268.125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09404969774186611, + "epoch": 0.142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07119587808847427, + "learning_rate": 1.99967116823068e-06, + "loss": 0.0016, + "num_tokens": 613300.0, + "reward": 8.309608459472656, + "reward_std": 7.589566707611084, + "rewards/fitness_reward/mean": 5.349855422973633, + "rewards/fitness_reward/std": 4.035140514373779, + "rewards/kidney_reward/mean": 1.5614449977874756, + "rewards/kidney_reward/std": 1.8345295190811157, + "rewards/length2tails_reward/mean": 0.4056016802787781, + "rewards/length2tails_reward/std": 0.4158850312232971, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.2577483654022217, + "rewards/thermo_reward/std": 2.606032371520996, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.375, + "completions/mean_terminated_length": 269.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09486975288018584, + "epoch": 0.144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11193745583295822, + "learning_rate": 1.999637465011021e-06, + "loss": 0.003, + "num_tokens": 621952.0, + "reward": 5.321290969848633, + "reward_std": 8.887402534484863, + "rewards/fitness_reward/mean": 4.612102508544922, + "rewards/fitness_reward/std": 4.851127624511719, + "rewards/kidney_reward/mean": 0.815106213092804, + "rewards/kidney_reward/std": 2.2408759593963623, + "rewards/length2tails_reward/mean": 0.49875015020370483, + "rewards/length2tails_reward/std": 0.42784377932548523, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": -0.2557927072048187, + "rewards/thermo_reward/std": 2.849024534225464, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10298145096749067, + "epoch": 0.146, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09242729842662811, + "learning_rate": 1.999602118138722e-06, + "loss": -0.0, + "num_tokens": 630673.0, + "reward": 5.481384754180908, + "reward_std": 9.99774169921875, + "rewards/fitness_reward/mean": 3.745878219604492, + "rewards/fitness_reward/std": 5.336620807647705, + "rewards/kidney_reward/mean": 0.8958523869514465, + "rewards/kidney_reward/std": 2.3274919986724854, + "rewards/length2tails_reward/mean": 0.7147248983383179, + "rewards/length2tails_reward/std": 0.33806055784225464, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.6681815981864929, + "rewards/thermo_reward/std": 3.090961217880249, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08604165725409985, + "epoch": 0.148, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10083026438951492, + "learning_rate": 1.9995651276719022e-06, + "loss": 0.0028, + "num_tokens": 639350.0, + "reward": 5.352714538574219, + "reward_std": 9.972210884094238, + "rewards/fitness_reward/mean": 3.942040205001831, + "rewards/fitness_reward/std": 5.167749404907227, + "rewards/kidney_reward/mean": 0.5562193393707275, + "rewards/kidney_reward/std": 2.3640551567077637, + "rewards/length2tails_reward/mean": 0.5871860980987549, + "rewards/length2tails_reward/std": 0.436727374792099, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 0.7019863128662109, + "rewards/thermo_reward/std": 3.129981517791748, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11275890469551086, + "epoch": 0.15, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08880766481161118, + "learning_rate": 1.999526493671383e-06, + "loss": 0.002, + "num_tokens": 648085.0, + "reward": 4.680360794067383, + "reward_std": 11.522732734680176, + "rewards/fitness_reward/mean": 3.4654369354248047, + "rewards/fitness_reward/std": 5.4723615646362305, + "rewards/kidney_reward/mean": 0.5316477417945862, + "rewards/kidney_reward/std": 2.7512929439544678, + "rewards/length2tails_reward/mean": 0.7533819675445557, + "rewards/length2tails_reward/std": 0.35159409046173096, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.5079379081726074, + "rewards/thermo_reward/std": 3.539987087249756, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.03125, + "completions/mean_terminated_length": 270.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0990257365629077, + "epoch": 0.152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1063062772154808, + "learning_rate": 1.999486216200688e-06, + "loss": 0.0038, + "num_tokens": 656758.0, + "reward": 6.651132106781006, + "reward_std": 8.796796798706055, + "rewards/fitness_reward/mean": 4.565807342529297, + "rewards/fitness_reward/std": 4.806469917297363, + "rewards/kidney_reward/mean": 1.3196901082992554, + "rewards/kidney_reward/std": 2.0117340087890625, + "rewards/length2tails_reward/mean": 0.5656614303588867, + "rewards/length2tails_reward/std": 0.38031989336013794, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 0.6153182983398438, + "rewards/thermo_reward/std": 2.7939960956573486, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1029750956222415, + "epoch": 0.154, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08091334998607635, + "learning_rate": 1.999444295326043e-06, + "loss": 0.0002, + "num_tokens": 665461.0, + "reward": 7.837647438049316, + "reward_std": 8.722521781921387, + "rewards/fitness_reward/mean": 5.028226375579834, + "rewards/fitness_reward/std": 4.435754776000977, + "rewards/kidney_reward/mean": 1.4896342754364014, + "rewards/kidney_reward/std": 1.9922144412994385, + "rewards/length2tails_reward/mean": 0.6519697904586792, + "rewards/length2tails_reward/std": 0.3789510726928711, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.1545900106430054, + "rewards/thermo_reward/std": 2.942502737045288, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.3125, + "completions/mean_terminated_length": 270.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0930614871904254, + "epoch": 0.156, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10220735520124435, + "learning_rate": 1.9994007311163777e-06, + "loss": 0.0018, + "num_tokens": 674143.0, + "reward": 5.842037677764893, + "reward_std": 9.24101734161377, + "rewards/fitness_reward/mean": 4.47142219543457, + "rewards/fitness_reward/std": 4.9697465896606445, + "rewards/kidney_reward/mean": 1.0660680532455444, + "rewards/kidney_reward/std": 2.2489302158355713, + "rewards/length2tails_reward/mean": 0.56805419921875, + "rewards/length2tails_reward/std": 0.416465163230896, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.14774230122566223, + "rewards/thermo_reward/std": 3.289302349090576, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.875, + "completions/mean_terminated_length": 269.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09572804998606443, + "epoch": 0.158, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07806969434022903, + "learning_rate": 1.999355523643321e-06, + "loss": 0.0031, + "num_tokens": 682811.0, + "reward": 4.878722667694092, + "reward_std": 10.568361282348633, + "rewards/fitness_reward/mean": 3.5764052867889404, + "rewards/fitness_reward/std": 5.329973220825195, + "rewards/kidney_reward/mean": 0.6331483125686646, + "rewards/kidney_reward/std": 2.4987599849700928, + "rewards/length2tails_reward/mean": 0.5859672427177429, + "rewards/length2tails_reward/std": 0.4179089665412903, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 0.5168222188949585, + "rewards/thermo_reward/std": 3.341632127761841, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.1875, + "completions/mean_terminated_length": 269.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1038662726059556, + "epoch": 0.16, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10314842313528061, + "learning_rate": 1.9993086729812065e-06, + "loss": 0.0029, + "num_tokens": 691457.0, + "reward": 7.508264541625977, + "reward_std": 8.252596855163574, + "rewards/fitness_reward/mean": 5.147569179534912, + "rewards/fitness_reward/std": 4.439197063446045, + "rewards/kidney_reward/mean": 1.4739444255828857, + "rewards/kidney_reward/std": 1.952415943145752, + "rewards/length2tails_reward/mean": 0.48841190338134766, + "rewards/length2tails_reward/std": 0.42245423793792725, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.7379104495048523, + "rewards/thermo_reward/std": 2.9669361114501953, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09912833292037249, + "epoch": 0.162, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09835723787546158, + "learning_rate": 1.9992601792070675e-06, + "loss": 0.0019, + "num_tokens": 700168.0, + "reward": 5.911773681640625, + "reward_std": 10.059379577636719, + "rewards/fitness_reward/mean": 3.8010964393615723, + "rewards/fitness_reward/std": 5.367618083953857, + "rewards/kidney_reward/mean": 1.019120454788208, + "rewards/kidney_reward/std": 2.3711156845092773, + "rewards/length2tails_reward/mean": 0.6711513996124268, + "rewards/length2tails_reward/std": 0.38624030351638794, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.9244422912597656, + "rewards/thermo_reward/std": 3.0485928058624268, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 262.3125, + "completions/mean_terminated_length": 262.3125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.09261684585362673, + "epoch": 0.164, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.302977591753006, + "learning_rate": 1.9992100424006403e-06, + "loss": -0.0424, + "num_tokens": 708594.0, + "reward": 5.575660705566406, + "reward_std": 9.62802505493164, + "rewards/fitness_reward/mean": 4.478847503662109, + "rewards/fitness_reward/std": 4.956948280334473, + "rewards/kidney_reward/mean": 0.8628518581390381, + "rewards/kidney_reward/std": 2.399749994277954, + "rewards/length2tails_reward/mean": 0.499155730009079, + "rewards/length2tails_reward/std": 0.4330158531665802, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.08404579013586044, + "rewards/thermo_reward/std": 3.165400981903076, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11235030833631754, + "epoch": 0.166, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1042870506644249, + "learning_rate": 1.9991582626443615e-06, + "loss": -0.0035, + "num_tokens": 717366.0, + "reward": 5.495595932006836, + "reward_std": 9.901491165161133, + "rewards/fitness_reward/mean": 4.186452865600586, + "rewards/fitness_reward/std": 5.04417085647583, + "rewards/kidney_reward/mean": 0.7878950834274292, + "rewards/kidney_reward/std": 2.339024066925049, + "rewards/length2tails_reward/mean": 0.8301782608032227, + "rewards/length2tails_reward/std": 0.2957022190093994, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.3382304012775421, + "rewards/thermo_reward/std": 3.0513925552368164, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.625, + "completions/mean_terminated_length": 270.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10217729490250349, + "epoch": 0.168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21394874155521393, + "learning_rate": 1.999104840023371e-06, + "loss": -0.0013, + "num_tokens": 726058.0, + "reward": 8.809710502624512, + "reward_std": 7.709327697753906, + "rewards/fitness_reward/mean": 5.823422431945801, + "rewards/fitness_reward/std": 3.6806018352508545, + "rewards/kidney_reward/mean": 1.486276388168335, + "rewards/kidney_reward/std": 1.9127298593521118, + "rewards/length2tails_reward/mean": 0.6566610336303711, + "rewards/length2tails_reward/std": 0.3470841646194458, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.3343451023101807, + "rewards/thermo_reward/std": 2.83933162689209, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09568210691213608, + "epoch": 0.17, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10421138256788254, + "learning_rate": 1.999049774625508e-06, + "loss": -0.0004, + "num_tokens": 734772.0, + "reward": 7.292887210845947, + "reward_std": 9.549410820007324, + "rewards/fitness_reward/mean": 4.795578479766846, + "rewards/fitness_reward/std": 4.69606876373291, + "rewards/kidney_reward/mean": 1.1542949676513672, + "rewards/kidney_reward/std": 2.3051979541778564, + "rewards/length2tails_reward/mean": 0.6634117364883423, + "rewards/length2tails_reward/std": 0.41317665576934814, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.1766724586486816, + "rewards/thermo_reward/std": 2.9498138427734375, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.71875, + "completions/mean_terminated_length": 269.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09443402104079723, + "epoch": 0.172, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10363943129777908, + "learning_rate": 1.9989930665413145e-06, + "loss": 0.0024, + "num_tokens": 743435.0, + "reward": 8.326192855834961, + "reward_std": 8.09758472442627, + "rewards/fitness_reward/mean": 5.624570846557617, + "rewards/fitness_reward/std": 4.108580112457275, + "rewards/kidney_reward/mean": 1.6422455310821533, + "rewards/kidney_reward/std": 1.912664771080017, + "rewards/length2tails_reward/mean": 0.5016780495643616, + "rewards/length2tails_reward/std": 0.43905147910118103, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.9092094898223877, + "rewards/thermo_reward/std": 2.8582024574279785, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.095752764493227, + "epoch": 0.174, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07105634361505508, + "learning_rate": 1.9989347158640323e-06, + "loss": 0.0005, + "num_tokens": 752112.0, + "reward": 6.623435020446777, + "reward_std": 9.345142364501953, + "rewards/fitness_reward/mean": 4.6255388259887695, + "rewards/fitness_reward/std": 4.729114532470703, + "rewards/kidney_reward/mean": 0.9081051349639893, + "rewards/kidney_reward/std": 2.3178250789642334, + "rewards/length2tails_reward/mean": 0.6243544816970825, + "rewards/length2tails_reward/std": 0.39751023054122925, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.9273550510406494, + "rewards/thermo_reward/std": 3.172642469406128, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09468840854242444, + "epoch": 0.176, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10832744091749191, + "learning_rate": 1.998874722689604e-06, + "loss": 0.0005, + "num_tokens": 760795.0, + "reward": 5.828799724578857, + "reward_std": 10.804570198059082, + "rewards/fitness_reward/mean": 3.6554083824157715, + "rewards/fitness_reward/std": 5.36220645904541, + "rewards/kidney_reward/mean": 0.8743642568588257, + "rewards/kidney_reward/std": 2.57539701461792, + "rewards/length2tails_reward/mean": 0.5887205600738525, + "rewards/length2tails_reward/std": 0.4141828119754791, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.14015531539917, + "rewards/thermo_reward/std": 3.175602674484253, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10456154402345419, + "epoch": 0.178, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07037483900785446, + "learning_rate": 1.9988130871166737e-06, + "loss": 0.0016, + "num_tokens": 769516.0, + "reward": 5.9289679527282715, + "reward_std": 9.64340877532959, + "rewards/fitness_reward/mean": 4.207990646362305, + "rewards/fitness_reward/std": 5.013721466064453, + "rewards/kidney_reward/mean": 0.8479658365249634, + "rewards/kidney_reward/std": 2.3798933029174805, + "rewards/length2tails_reward/mean": 0.6725878119468689, + "rewards/length2tails_reward/std": 0.40477490425109863, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.7057527899742126, + "rewards/thermo_reward/std": 3.074697494506836, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.53125, + "completions/mean_terminated_length": 269.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09975282941013575, + "epoch": 0.18, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12680593132972717, + "learning_rate": 1.9987498092465854e-06, + "loss": 0.0013, + "num_tokens": 778173.0, + "reward": 6.914088249206543, + "reward_std": 8.010117530822754, + "rewards/fitness_reward/mean": 4.84921932220459, + "rewards/fitness_reward/std": 4.466445446014404, + "rewards/kidney_reward/mean": 1.2208207845687866, + "rewards/kidney_reward/std": 1.968360185623169, + "rewards/length2tails_reward/mean": 0.5152795910835266, + "rewards/length2tails_reward/std": 0.41995134949684143, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 0.6987695097923279, + "rewards/thermo_reward/std": 2.7424521446228027, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.71875, + "completions/mean_terminated_length": 269.71875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09674192778766155, + "epoch": 0.182, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07509780675172806, + "learning_rate": 1.9986848891833845e-06, + "loss": 0.0024, + "num_tokens": 786836.0, + "reward": 8.015886306762695, + "reward_std": 8.986359596252441, + "rewards/fitness_reward/mean": 4.971525192260742, + "rewards/fitness_reward/std": 4.600165843963623, + "rewards/kidney_reward/mean": 1.5063060522079468, + "rewards/kidney_reward/std": 1.9875105619430542, + "rewards/length2tails_reward/mean": 0.5797788500785828, + "rewards/length2tails_reward/std": 0.37075045704841614, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.3800766468048096, + "rewards/thermo_reward/std": 2.7960541248321533, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10261530056595802, + "epoch": 0.184, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21921300888061523, + "learning_rate": 1.9986183270338147e-06, + "loss": 0.004, + "num_tokens": 795574.0, + "reward": 7.21077823638916, + "reward_std": 9.365437507629395, + "rewards/fitness_reward/mean": 4.806048393249512, + "rewards/fitness_reward/std": 4.652261257171631, + "rewards/kidney_reward/mean": 1.1478865146636963, + "rewards/kidney_reward/std": 2.238415241241455, + "rewards/length2tails_reward/mean": 0.6891014575958252, + "rewards/length2tails_reward/std": 0.36245107650756836, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.087932825088501, + "rewards/thermo_reward/std": 2.9496536254882812, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10041815415024757, + "epoch": 0.186, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.061401546001434326, + "learning_rate": 1.998550122907321e-06, + "loss": 0.0003, + "num_tokens": 804258.0, + "reward": 8.16575813293457, + "reward_std": 8.563042640686035, + "rewards/fitness_reward/mean": 5.563236236572266, + "rewards/fitness_reward/std": 4.104979991912842, + "rewards/kidney_reward/mean": 1.351224660873413, + "rewards/kidney_reward/std": 2.202674627304077, + "rewards/length2tails_reward/mean": 0.6049997806549072, + "rewards/length2tails_reward/std": 0.3989236354827881, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.0907974243164062, + "rewards/thermo_reward/std": 3.0085599422454834, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.59375, + "completions/mean_terminated_length": 270.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10706782713532448, + "epoch": 0.188, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16338106989860535, + "learning_rate": 1.998480276916048e-06, + "loss": -0.0037, + "num_tokens": 812949.0, + "reward": 7.739925384521484, + "reward_std": 9.26419734954834, + "rewards/fitness_reward/mean": 4.661970138549805, + "rewards/fitness_reward/std": 4.768324851989746, + "rewards/kidney_reward/mean": 1.5202274322509766, + "rewards/kidney_reward/std": 2.156172037124634, + "rewards/length2tails_reward/mean": 0.6187995672225952, + "rewards/length2tails_reward/std": 0.3834105432033539, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.3958477973937988, + "rewards/thermo_reward/std": 2.8934850692749023, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 264.25, + "completions/mean_terminated_length": 264.25, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.11393436696380377, + "epoch": 0.19, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27784472703933716, + "learning_rate": 1.99840878917484e-06, + "loss": -0.0679, + "num_tokens": 821437.0, + "reward": 9.508270263671875, + "reward_std": 8.273795127868652, + "rewards/fitness_reward/mean": 5.799304962158203, + "rewards/fitness_reward/std": 3.9007155895233154, + "rewards/kidney_reward/mean": 1.7468581199645996, + "rewards/kidney_reward/std": 2.057310104370117, + "rewards/length2tails_reward/mean": 0.6669470071792603, + "rewards/length2tails_reward/std": 0.3571520447731018, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.7954118251800537, + "rewards/thermo_reward/std": 2.665423631668091, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.59375, + "completions/mean_terminated_length": 269.59375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.08702652528882027, + "epoch": 0.192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08435554802417755, + "learning_rate": 1.998335659801241e-06, + "loss": 0.0049, + "num_tokens": 830096.0, + "reward": 4.850047588348389, + "reward_std": 9.819940567016602, + "rewards/fitness_reward/mean": 3.855299949645996, + "rewards/fitness_reward/std": 5.291053295135498, + "rewards/kidney_reward/mean": 0.6495881080627441, + "rewards/kidney_reward/std": 2.2953221797943115, + "rewards/length2tails_reward/mean": 0.571954607963562, + "rewards/length2tails_reward/std": 0.40634799003601074, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.18796436488628387, + "rewards/thermo_reward/std": 3.090845823287964, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.375, + "completions/mean_terminated_length": 269.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.08522860752418637, + "epoch": 0.194, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1303863376379013, + "learning_rate": 1.9982608889154927e-06, + "loss": 0.0034, + "num_tokens": 838748.0, + "reward": 4.834753036499023, + "reward_std": 10.436513900756836, + "rewards/fitness_reward/mean": 3.685192584991455, + "rewards/fitness_reward/std": 5.200677394866943, + "rewards/kidney_reward/mean": 0.5920010805130005, + "rewards/kidney_reward/std": 2.409742593765259, + "rewards/length2tails_reward/mean": 0.5136910676956177, + "rewards/length2tails_reward/std": 0.4063299596309662, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.4061906337738037, + "rewards/thermo_reward/std": 3.287951946258545, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 269.59375, + "completions/mean_terminated_length": 269.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09816279914230108, + "epoch": 0.196, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09529352188110352, + "learning_rate": 1.9981844766405384e-06, + "loss": 0.0006, + "num_tokens": 847407.0, + "reward": 6.227548599243164, + "reward_std": 8.884981155395508, + "rewards/fitness_reward/mean": 4.00059700012207, + "rewards/fitness_reward/std": 5.081515312194824, + "rewards/kidney_reward/mean": 1.2143152952194214, + "rewards/kidney_reward/std": 2.036752462387085, + "rewards/length2tails_reward/mean": 0.5433864593505859, + "rewards/length2tails_reward/std": 0.42335325479507446, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.8582972884178162, + "rewards/thermo_reward/std": 2.740398406982422, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10271472111344337, + "epoch": 0.198, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09906383603811264, + "learning_rate": 1.998106423102018e-06, + "loss": 0.0005, + "num_tokens": 856131.0, + "reward": 6.8593549728393555, + "reward_std": 10.270197868347168, + "rewards/fitness_reward/mean": 3.989349603652954, + "rewards/fitness_reward/std": 5.256290435791016, + "rewards/kidney_reward/mean": 1.1595712900161743, + "rewards/kidney_reward/std": 2.4321062564849854, + "rewards/length2tails_reward/mean": 0.6354026794433594, + "rewards/length2tails_reward/std": 0.4248734414577484, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.5468939542770386, + "rewards/thermo_reward/std": 3.099806070327759, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10207742638885975, + "epoch": 0.2, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09420901536941528, + "learning_rate": 1.9980267284282714e-06, + "loss": 0.0026, + "num_tokens": 864824.0, + "reward": 7.539562225341797, + "reward_std": 9.378880500793457, + "rewards/fitness_reward/mean": 5.152955055236816, + "rewards/fitness_reward/std": 4.533102035522461, + "rewards/kidney_reward/mean": 1.27552330493927, + "rewards/kidney_reward/std": 2.2748749256134033, + "rewards/length2tails_reward/mean": 0.6058512330055237, + "rewards/length2tails_reward/std": 0.38518333435058594, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.9504985809326172, + "rewards/thermo_reward/std": 3.0656003952026367, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09604429174214602, + "epoch": 0.202, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07431904971599579, + "learning_rate": 1.9979453927503364e-06, + "loss": -0.0, + "num_tokens": 873508.0, + "reward": 8.550683975219727, + "reward_std": 8.27658748626709, + "rewards/fitness_reward/mean": 5.503734588623047, + "rewards/fitness_reward/std": 4.0965447425842285, + "rewards/kidney_reward/mean": 1.611420750617981, + "rewards/kidney_reward/std": 2.0140531063079834, + "rewards/length2tails_reward/mean": 0.5655707120895386, + "rewards/length2tails_reward/std": 0.413197785615921, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.2789708375930786, + "rewards/thermo_reward/std": 2.9564778804779053, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 265.5, + "completions/mean_terminated_length": 265.5, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.10650088731199503, + "epoch": 0.204, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22565796971321106, + "learning_rate": 1.9978624162019487e-06, + "loss": -0.0312, + "num_tokens": 882036.0, + "reward": 4.945562839508057, + "reward_std": 9.923328399658203, + "rewards/fitness_reward/mean": 3.839808464050293, + "rewards/fitness_reward/std": 5.21298360824585, + "rewards/kidney_reward/mean": 0.5858813524246216, + "rewards/kidney_reward/std": 2.3432159423828125, + "rewards/length2tails_reward/mean": 0.5365294218063354, + "rewards/length2tails_reward/std": 0.4102689325809479, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 0.3724702298641205, + "rewards/thermo_reward/std": 3.0979273319244385, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.3125, + "completions/mean_terminated_length": 269.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09324608091264963, + "epoch": 0.206, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07994762808084488, + "learning_rate": 1.9977777989195427e-06, + "loss": 0.0016, + "num_tokens": 890686.0, + "reward": 7.803867816925049, + "reward_std": 8.779563903808594, + "rewards/fitness_reward/mean": 4.832815170288086, + "rewards/fitness_reward/std": 4.733124256134033, + "rewards/kidney_reward/mean": 1.5026731491088867, + "rewards/kidney_reward/std": 1.9731096029281616, + "rewards/length2tails_reward/mean": 0.5229619741439819, + "rewards/length2tails_reward/std": 0.3595113158226013, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.3160828351974487, + "rewards/thermo_reward/std": 2.8773632049560547, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.3125, + "completions/mean_terminated_length": 270.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09143533185124397, + "epoch": 0.208, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15814386308193207, + "learning_rate": 1.99769154104225e-06, + "loss": -0.0006, + "num_tokens": 899368.0, + "reward": 6.416100978851318, + "reward_std": 10.01828670501709, + "rewards/fitness_reward/mean": 3.8951594829559326, + "rewards/fitness_reward/std": 5.236571788787842, + "rewards/kidney_reward/mean": 1.1687991619110107, + "rewards/kidney_reward/std": 2.2653310298919678, + "rewards/length2tails_reward/mean": 0.5847136974334717, + "rewards/length2tails_reward/std": 0.3893841505050659, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.1936709880828857, + "rewards/thermo_reward/std": 2.943660020828247, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10562636330723763, + "epoch": 0.21, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07695178687572479, + "learning_rate": 1.9976036427119007e-06, + "loss": -0.001, + "num_tokens": 908067.0, + "reward": 8.14999771118164, + "reward_std": 8.205320358276367, + "rewards/fitness_reward/mean": 5.517228126525879, + "rewards/fitness_reward/std": 4.090753078460693, + "rewards/kidney_reward/mean": 1.4921324253082275, + "rewards/kidney_reward/std": 2.0499107837677, + "rewards/length2tails_reward/mean": 0.61553955078125, + "rewards/length2tails_reward/std": 0.39413321018218994, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.9790826439857483, + "rewards/thermo_reward/std": 2.7510807514190674, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10066411457955837, + "epoch": 0.212, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22739523649215698, + "learning_rate": 1.9975141040730204e-06, + "loss": 0.0031, + "num_tokens": 916779.0, + "reward": 4.442440509796143, + "reward_std": 10.857921600341797, + "rewards/fitness_reward/mean": 3.392305850982666, + "rewards/fitness_reward/std": 5.459863185882568, + "rewards/kidney_reward/mean": 0.45278146862983704, + "rewards/kidney_reward/std": 2.604245185852051, + "rewards/length2tails_reward/mean": 0.7072968482971191, + "rewards/length2tails_reward/std": 0.3663152754306793, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.426623672246933, + "rewards/thermo_reward/std": 3.39660382270813, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.09375, + "completions/mean_terminated_length": 270.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10727217141538858, + "epoch": 0.214, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11362366378307343, + "learning_rate": 1.997422925272834e-06, + "loss": 0.0006, + "num_tokens": 925454.0, + "reward": 7.628767013549805, + "reward_std": 8.385255813598633, + "rewards/fitness_reward/mean": 5.151345252990723, + "rewards/fitness_reward/std": 4.295953750610352, + "rewards/kidney_reward/mean": 1.4122976064682007, + "rewards/kidney_reward/std": 1.9665799140930176, + "rewards/length2tails_reward/mean": 0.5739939212799072, + "rewards/length2tails_reward/std": 0.38083434104919434, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.9077249765396118, + "rewards/thermo_reward/std": 2.839327096939087, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 266.6875, + "completions/mean_terminated_length": 266.6875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.09353059530258179, + "epoch": 0.216, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2610819637775421, + "learning_rate": 1.997330106461261e-06, + "loss": -0.028, + "num_tokens": 934020.0, + "reward": 5.741405010223389, + "reward_std": 10.462055206298828, + "rewards/fitness_reward/mean": 4.127673625946045, + "rewards/fitness_reward/std": 5.146501541137695, + "rewards/kidney_reward/mean": 0.7454001903533936, + "rewards/kidney_reward/std": 2.6007983684539795, + "rewards/length2tails_reward/mean": 0.605473518371582, + "rewards/length2tails_reward/std": 0.39351916313171387, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.7077840566635132, + "rewards/thermo_reward/std": 3.408210277557373, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 281.40625, + "completions/mean_terminated_length": 281.40625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10695363953709602, + "epoch": 0.218, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6681356430053711, + "learning_rate": 1.9972356477909203e-06, + "loss": 0.0724, + "num_tokens": 943057.0, + "reward": 6.142980575561523, + "reward_std": 10.004242897033691, + "rewards/fitness_reward/mean": 4.553394317626953, + "rewards/fitness_reward/std": 4.9470624923706055, + "rewards/kidney_reward/mean": 1.0444412231445312, + "rewards/kidney_reward/std": 2.3431780338287354, + "rewards/length2tails_reward/mean": 0.6724941730499268, + "rewards/length2tails_reward/std": 0.36398619413375854, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.37789565324783325, + "rewards/thermo_reward/std": 3.2467703819274902, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.25, + "completions/mean_terminated_length": 269.25, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.0894242450594902, + "epoch": 0.22, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08553007990121841, + "learning_rate": 1.997139549417124e-06, + "loss": 0.0058, + "num_tokens": 951705.0, + "reward": 7.994678020477295, + "reward_std": 8.695246696472168, + "rewards/fitness_reward/mean": 5.522009372711182, + "rewards/fitness_reward/std": 4.194775581359863, + "rewards/kidney_reward/mean": 1.1783056259155273, + "rewards/kidney_reward/std": 2.219749927520752, + "rewards/length2tails_reward/mean": 0.5040951371192932, + "rewards/length2tails_reward/std": 0.41517889499664307, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.1439536809921265, + "rewards/thermo_reward/std": 3.077824354171753, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.375, + "completions/mean_terminated_length": 269.375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.09392447862774134, + "epoch": 0.222, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2115517109632492, + "learning_rate": 1.9970418114978816e-06, + "loss": -0.003, + "num_tokens": 960357.0, + "reward": 6.93775749206543, + "reward_std": 9.360590934753418, + "rewards/fitness_reward/mean": 4.801181793212891, + "rewards/fitness_reward/std": 4.678520202636719, + "rewards/kidney_reward/mean": 1.1626392602920532, + "rewards/kidney_reward/std": 2.380502939224243, + "rewards/length2tails_reward/mean": 0.5976717472076416, + "rewards/length2tails_reward/std": 0.37266412377357483, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 0.82041996717453, + "rewards/thermo_reward/std": 2.931457042694092, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 276.625, + "completions/mean_terminated_length": 276.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1262630559504032, + "epoch": 0.224, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7861679792404175, + "learning_rate": 1.9969424341938997e-06, + "loss": 0.0888, + "num_tokens": 969241.0, + "reward": 9.52402114868164, + "reward_std": 7.2843241691589355, + "rewards/fitness_reward/mean": 5.664241790771484, + "rewards/fitness_reward/std": 4.035696506500244, + "rewards/kidney_reward/mean": 1.9477050304412842, + "rewards/kidney_reward/std": 1.6191411018371582, + "rewards/length2tails_reward/mean": 0.4688212275505066, + "rewards/length2tails_reward/std": 0.39938884973526, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 1.7714415788650513, + "rewards/thermo_reward/std": 2.5693249702453613, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10180703271180391, + "epoch": 0.226, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10458873957395554, + "learning_rate": 1.9968414176685787e-06, + "loss": -0.0011, + "num_tokens": 977975.0, + "reward": 5.531233787536621, + "reward_std": 10.222626686096191, + "rewards/fitness_reward/mean": 3.81538724899292, + "rewards/fitness_reward/std": 5.117627143859863, + "rewards/kidney_reward/mean": 0.6674569845199585, + "rewards/kidney_reward/std": 2.389415740966797, + "rewards/length2tails_reward/mean": 0.6948447227478027, + "rewards/length2tails_reward/std": 0.3781306743621826, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.8789056539535522, + "rewards/thermo_reward/std": 3.083277463912964, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.90625, + "completions/mean_terminated_length": 269.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09896293561905622, + "epoch": 0.228, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0679410845041275, + "learning_rate": 1.9967387620880144e-06, + "loss": 0.0011, + "num_tokens": 986644.0, + "reward": 8.611519813537598, + "reward_std": 8.677281379699707, + "rewards/fitness_reward/mean": 5.582242012023926, + "rewards/fitness_reward/std": 4.200404167175293, + "rewards/kidney_reward/mean": 1.5517706871032715, + "rewards/kidney_reward/std": 2.0585439205169678, + "rewards/length2tails_reward/mean": 0.5672687292098999, + "rewards/length2tails_reward/std": 0.39586639404296875, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.3207803964614868, + "rewards/thermo_reward/std": 2.873594045639038, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 268.625, + "completions/mean_terminated_length": 268.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09147355891764164, + "epoch": 0.23, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.121434785425663, + "learning_rate": 1.996634467620999e-06, + "loss": -0.0005, + "num_tokens": 995272.0, + "reward": 6.599301815032959, + "reward_std": 9.238264083862305, + "rewards/fitness_reward/mean": 4.2816853523254395, + "rewards/fitness_reward/std": 4.801488876342773, + "rewards/kidney_reward/mean": 1.1353163719177246, + "rewards/kidney_reward/std": 2.170203685760498, + "rewards/length2tails_reward/mean": 0.47178003191947937, + "rewards/length2tails_reward/std": 0.4033353328704834, + "rewards/repeated_in_batch_reward/mean": 0.875, + "rewards/repeated_in_batch_reward/std": 0.33601075410842896, + "rewards/thermo_reward/mean": 1.0476219654083252, + "rewards/thermo_reward/std": 2.811617612838745, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 265.3125, + "completions/mean_terminated_length": 265.3125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.12102366890758276, + "epoch": 0.232, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42420950531959534, + "learning_rate": 1.9965285344390183e-06, + "loss": -0.0484, + "num_tokens": 1003794.0, + "reward": 6.1183061599731445, + "reward_std": 9.711209297180176, + "rewards/fitness_reward/mean": 4.594058036804199, + "rewards/fitness_reward/std": 4.714536190032959, + "rewards/kidney_reward/mean": 0.8576416373252869, + "rewards/kidney_reward/std": 2.442807674407959, + "rewards/length2tails_reward/mean": 0.6816294193267822, + "rewards/length2tails_reward/std": 0.38159894943237305, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.498443603515625, + "rewards/thermo_reward/std": 3.1422343254089355, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.90625, + "completions/mean_terminated_length": 269.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09542591776698828, + "epoch": 0.234, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17690907418727875, + "learning_rate": 1.9964209627162524e-06, + "loss": 0.0051, + "num_tokens": 1012463.0, + "reward": 6.564347743988037, + "reward_std": 9.992843627929688, + "rewards/fitness_reward/mean": 4.48134183883667, + "rewards/fitness_reward/std": 4.829487323760986, + "rewards/kidney_reward/mean": 0.9403144121170044, + "rewards/kidney_reward/std": 2.4145545959472656, + "rewards/length2tails_reward/mean": 0.5347142219543457, + "rewards/length2tails_reward/std": 0.41005927324295044, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.9892200827598572, + "rewards/thermo_reward/std": 3.2281882762908936, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09838891867548227, + "epoch": 0.236, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09169985353946686, + "learning_rate": 1.996311752629576e-06, + "loss": 0.0048, + "num_tokens": 1021157.0, + "reward": 7.520411491394043, + "reward_std": 9.169465065002441, + "rewards/fitness_reward/mean": 5.126192092895508, + "rewards/fitness_reward/std": 4.452926158905029, + "rewards/kidney_reward/mean": 1.2386646270751953, + "rewards/kidney_reward/std": 2.297891855239868, + "rewards/length2tails_reward/mean": 0.6078680753707886, + "rewards/length2tails_reward/std": 0.39115339517593384, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.9947676062583923, + "rewards/thermo_reward/std": 2.9921579360961914, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 268.84375, + "completions/mean_terminated_length": 268.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09704151190817356, + "epoch": 0.238, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0968693345785141, + "learning_rate": 1.9962009043585583e-06, + "loss": 0.0018, + "num_tokens": 1029792.0, + "reward": 7.006888389587402, + "reward_std": 8.42821979522705, + "rewards/fitness_reward/mean": 5.212890148162842, + "rewards/fitness_reward/std": 4.411964416503906, + "rewards/kidney_reward/mean": 1.0951461791992188, + "rewards/kidney_reward/std": 2.16939640045166, + "rewards/length2tails_reward/mean": 0.4964791536331177, + "rewards/length2tails_reward/std": 0.36847445368766785, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.5492041707038879, + "rewards/thermo_reward/std": 3.0719711780548096, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.625, + "completions/mean_terminated_length": 269.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10862219892442226, + "epoch": 0.24, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08731769770383835, + "learning_rate": 1.9960884180854605e-06, + "loss": 0.0026, + "num_tokens": 1038452.0, + "reward": 7.580234050750732, + "reward_std": 8.95448112487793, + "rewards/fitness_reward/mean": 5.328237533569336, + "rewards/fitness_reward/std": 4.111377239227295, + "rewards/kidney_reward/mean": 1.0963175296783447, + "rewards/kidney_reward/std": 2.2142772674560547, + "rewards/length2tails_reward/mean": 0.5319727063179016, + "rewards/length2tails_reward/std": 0.4041937291622162, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.0024813413619995, + "rewards/thermo_reward/std": 3.4010684490203857, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.71875, + "completions/mean_terminated_length": 269.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10291161388158798, + "epoch": 0.242, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13009320199489594, + "learning_rate": 1.995974293995239e-06, + "loss": -0.0034, + "num_tokens": 1047115.0, + "reward": 7.1537346839904785, + "reward_std": 8.474778175354004, + "rewards/fitness_reward/mean": 5.0962724685668945, + "rewards/fitness_reward/std": 4.272363662719727, + "rewards/kidney_reward/mean": 1.317286491394043, + "rewards/kidney_reward/std": 2.1928529739379883, + "rewards/length2tails_reward/mean": 0.5597797632217407, + "rewards/length2tails_reward/std": 0.38189437985420227, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 0.5904480814933777, + "rewards/thermo_reward/std": 2.832634449005127, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 267.1875, + "completions/mean_terminated_length": 267.1875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.10709732491523027, + "epoch": 0.244, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.523727536201477, + "learning_rate": 1.9958585322755417e-06, + "loss": -0.0344, + "num_tokens": 1055697.0, + "reward": 4.962468147277832, + "reward_std": 10.686628341674805, + "rewards/fitness_reward/mean": 3.538210868835449, + "rewards/fitness_reward/std": 5.383859157562256, + "rewards/kidney_reward/mean": 0.7698438763618469, + "rewards/kidney_reward/std": 2.587346076965332, + "rewards/length2tails_reward/mean": 0.7483822107315063, + "rewards/length2tails_reward/std": 0.33493828773498535, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.4795750379562378, + "rewards/thermo_reward/std": 3.2550032138824463, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.5, + "completions/mean_terminated_length": 269.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09620358562096953, + "epoch": 0.246, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08945641666650772, + "learning_rate": 1.9957411331167095e-06, + "loss": -0.0016, + "num_tokens": 1064353.0, + "reward": 8.262110710144043, + "reward_std": 8.13237190246582, + "rewards/fitness_reward/mean": 5.478936672210693, + "rewards/fitness_reward/std": 4.158201694488525, + "rewards/kidney_reward/mean": 1.3797082901000977, + "rewards/kidney_reward/std": 2.045048236846924, + "rewards/length2tails_reward/mean": 0.5142487287521362, + "rewards/length2tails_reward/std": 0.4123833477497101, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.2520408630371094, + "rewards/thermo_reward/std": 3.0139739513397217, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 268.8125, + "completions/mean_terminated_length": 268.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09582378715276718, + "epoch": 0.248, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11037473380565643, + "learning_rate": 1.9956220967117754e-06, + "loss": -0.0008, + "num_tokens": 1072987.0, + "reward": 7.818872928619385, + "reward_std": 9.085912704467773, + "rewards/fitness_reward/mean": 5.416874885559082, + "rewards/fitness_reward/std": 4.186379432678223, + "rewards/kidney_reward/mean": 1.1898185014724731, + "rewards/kidney_reward/std": 2.350635290145874, + "rewards/length2tails_reward/mean": 0.5090211629867554, + "rewards/length2tails_reward/std": 0.4061613976955414, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.0612778663635254, + "rewards/thermo_reward/std": 3.424208164215088, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.40625, + "completions/mean_terminated_length": 269.40625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10349469911307096, + "epoch": 0.25, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2761553227901459, + "learning_rate": 1.9955014232564663e-06, + "loss": -0.001, + "num_tokens": 1081640.0, + "reward": 11.220212936401367, + "reward_std": 4.9659504890441895, + "rewards/fitness_reward/mean": 6.890782356262207, + "rewards/fitness_reward/std": 2.0400962829589844, + "rewards/kidney_reward/mean": 2.115264415740967, + "rewards/kidney_reward/std": 1.3640040159225464, + "rewards/length2tails_reward/mean": 0.556138277053833, + "rewards/length2tails_reward/std": 0.32990503311157227, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.0585532188415527, + "rewards/thermo_reward/std": 2.3020107746124268, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10269631631672382, + "epoch": 0.252, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15978340804576874, + "learning_rate": 1.9953791129491983e-06, + "loss": -0.0021, + "num_tokens": 1090317.0, + "reward": 7.519195079803467, + "reward_std": 8.24329662322998, + "rewards/fitness_reward/mean": 5.177550315856934, + "rewards/fitness_reward/std": 4.349887371063232, + "rewards/kidney_reward/mean": 1.2797091007232666, + "rewards/kidney_reward/std": 2.0155045986175537, + "rewards/length2tails_reward/mean": 0.5976382493972778, + "rewards/length2tails_reward/std": 0.37005844712257385, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.9021725654602051, + "rewards/thermo_reward/std": 2.889704942703247, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0951996436342597, + "epoch": 0.254, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0809130147099495, + "learning_rate": 1.99525516599108e-06, + "loss": 0.0034, + "num_tokens": 1098997.0, + "reward": 6.134978294372559, + "reward_std": 9.807641983032227, + "rewards/fitness_reward/mean": 4.49169397354126, + "rewards/fitness_reward/std": 4.9330644607543945, + "rewards/kidney_reward/mean": 1.0262835025787354, + "rewards/kidney_reward/std": 2.369699239730835, + "rewards/length2tails_reward/mean": 0.6029974222183228, + "rewards/length2tails_reward/std": 0.3860572874546051, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.4567016363143921, + "rewards/thermo_reward/std": 3.154005765914917, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.09375, + "completions/mean_terminated_length": 270.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10363792441785336, + "epoch": 0.256, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2734423875808716, + "learning_rate": 1.995129582585911e-06, + "loss": -0.0037, + "num_tokens": 1107672.0, + "reward": 10.065555572509766, + "reward_std": 7.323700428009033, + "rewards/fitness_reward/mean": 6.216155052185059, + "rewards/fitness_reward/std": 3.252866268157959, + "rewards/kidney_reward/mean": 1.8607220649719238, + "rewards/kidney_reward/std": 1.8087016344070435, + "rewards/length2tails_reward/mean": 0.5705130100250244, + "rewards/length2tails_reward/std": 0.37505844235420227, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.8316264152526855, + "rewards/thermo_reward/std": 2.7449350357055664, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0994394039735198, + "epoch": 0.258, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06736920028924942, + "learning_rate": 1.9950023629401823e-06, + "loss": -0.0011, + "num_tokens": 1116388.0, + "reward": 8.246410369873047, + "reward_std": 8.444506645202637, + "rewards/fitness_reward/mean": 5.335963726043701, + "rewards/fitness_reward/std": 4.293360233306885, + "rewards/kidney_reward/mean": 1.3580021858215332, + "rewards/kidney_reward/std": 1.9836363792419434, + "rewards/length2tails_reward/mean": 0.6779234409332275, + "rewards/length2tails_reward/std": 0.35614529252052307, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 1.390902042388916, + "rewards/thermo_reward/std": 2.899958372116089, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 265.25, + "completions/mean_terminated_length": 265.25, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.10844458639621735, + "epoch": 0.26, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5157034397125244, + "learning_rate": 1.9948735072630743e-06, + "loss": -0.0613, + "num_tokens": 1124908.0, + "reward": 7.14207124710083, + "reward_std": 9.764935493469238, + "rewards/fitness_reward/mean": 4.8439483642578125, + "rewards/fitness_reward/std": 4.8512396812438965, + "rewards/kidney_reward/mean": 1.2248245477676392, + "rewards/kidney_reward/std": 2.3549630641937256, + "rewards/length2tails_reward/mean": 0.6458985805511475, + "rewards/length2tails_reward/std": 0.39737337827682495, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.9087082147598267, + "rewards/thermo_reward/std": 3.279670000076294, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10960131324827671, + "epoch": 0.262, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06332565099000931, + "learning_rate": 1.9947430157664573e-06, + "loss": -0.0011, + "num_tokens": 1133610.0, + "reward": 10.147733688354492, + "reward_std": 7.310003757476807, + "rewards/fitness_reward/mean": 5.934072494506836, + "rewards/fitness_reward/std": 3.8383984565734863, + "rewards/kidney_reward/mean": 1.8624502420425415, + "rewards/kidney_reward/std": 1.6555120944976807, + "rewards/length2tails_reward/mean": 0.6436142325401306, + "rewards/length2tails_reward/std": 0.367124080657959, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.186849594116211, + "rewards/thermo_reward/std": 2.4702565670013428, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11062491126358509, + "epoch": 0.264, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11164043098688126, + "learning_rate": 1.9946108886648925e-06, + "loss": 0.0007, + "num_tokens": 1142320.0, + "reward": 8.521629333496094, + "reward_std": 8.78165054321289, + "rewards/fitness_reward/mean": 5.5364556312561035, + "rewards/fitness_reward/std": 4.161991119384766, + "rewards/kidney_reward/mean": 1.4564990997314453, + "rewards/kidney_reward/std": 2.1312105655670166, + "rewards/length2tails_reward/mean": 0.6381245851516724, + "rewards/length2tails_reward/std": 0.3892870545387268, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.3648622035980225, + "rewards/thermo_reward/std": 2.9711556434631348, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10500903520733118, + "epoch": 0.266, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06918232887983322, + "learning_rate": 1.994477126175629e-06, + "loss": -0.0014, + "num_tokens": 1151022.0, + "reward": 6.013577461242676, + "reward_std": 9.91647720336914, + "rewards/fitness_reward/mean": 4.206317901611328, + "rewards/fitness_reward/std": 5.031452655792236, + "rewards/kidney_reward/mean": 1.0269017219543457, + "rewards/kidney_reward/std": 2.391968250274658, + "rewards/length2tails_reward/mean": 0.6653873920440674, + "rewards/length2tails_reward/std": 0.3800601363182068, + "rewards/repeated_in_batch_reward/mean": 0.90625, + "rewards/repeated_in_batch_reward/std": 0.2961445748806, + "rewards/thermo_reward/mean": 0.6231940984725952, + "rewards/thermo_reward/std": 3.0378589630126953, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09510938823223114, + "epoch": 0.268, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10929831117391586, + "learning_rate": 1.994341728518606e-06, + "loss": 0.006, + "num_tokens": 1159706.0, + "reward": 4.943110942840576, + "reward_std": 10.271505355834961, + "rewards/fitness_reward/mean": 3.783754825592041, + "rewards/fitness_reward/std": 5.177763938903809, + "rewards/kidney_reward/mean": 0.6879823207855225, + "rewards/kidney_reward/std": 2.4497663974761963, + "rewards/length2tails_reward/mean": 0.5846083164215088, + "rewards/length2tails_reward/std": 0.44055286049842834, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.3129129707813263, + "rewards/thermo_reward/std": 3.2092156410217285, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 263.65625, + "completions/mean_terminated_length": 263.65625, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "entropy": 0.11222867853939533, + "epoch": 0.27, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32465440034866333, + "learning_rate": 1.994204695916451e-06, + "loss": -0.0471, + "num_tokens": 1168175.0, + "reward": 5.701061248779297, + "reward_std": 10.690949440002441, + "rewards/fitness_reward/mean": 3.6731255054473877, + "rewards/fitness_reward/std": 5.350979328155518, + "rewards/kidney_reward/mean": 0.8555735945701599, + "rewards/kidney_reward/std": 2.5589215755462646, + "rewards/length2tails_reward/mean": 0.7329180836677551, + "rewards/length2tails_reward/std": 0.31912535429000854, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.9990701675415039, + "rewards/thermo_reward/std": 3.3446438312530518, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 269.40625, + "completions/mean_terminated_length": 269.40625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09923252556473017, + "epoch": 0.272, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06646724790334702, + "learning_rate": 1.99406602859448e-06, + "loss": 0.0031, + "num_tokens": 1176828.0, + "reward": 8.432554244995117, + "reward_std": 7.6039862632751465, + "rewards/fitness_reward/mean": 5.432223320007324, + "rewards/fitness_reward/std": 4.257699966430664, + "rewards/kidney_reward/mean": 1.6723767518997192, + "rewards/kidney_reward/std": 1.8270021677017212, + "rewards/length2tails_reward/mean": 0.5134449005126953, + "rewards/length2tails_reward/std": 0.4071425497531891, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.1766104698181152, + "rewards/thermo_reward/std": 2.9092674255371094, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 270.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1019074423238635, + "epoch": 0.274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06599363684654236, + "learning_rate": 1.9939257267806963e-06, + "loss": 0.0037, + "num_tokens": 1185502.0, + "reward": 6.927309989929199, + "reward_std": 9.177210807800293, + "rewards/fitness_reward/mean": 4.872681617736816, + "rewards/fitness_reward/std": 4.654974460601807, + "rewards/kidney_reward/mean": 1.2820192575454712, + "rewards/kidney_reward/std": 2.128758668899536, + "rewards/length2tails_reward/mean": 0.6186249256134033, + "rewards/length2tails_reward/std": 0.3422185778617859, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.6107470989227295, + "rewards/thermo_reward/std": 3.1053736209869385, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.15625, + "completions/mean_terminated_length": 269.15625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09770497446879745, + "epoch": 0.276, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0907408595085144, + "learning_rate": 1.9937837907057903e-06, + "loss": 0.0023, + "num_tokens": 1194147.0, + "reward": 8.242408752441406, + "reward_std": 7.911430835723877, + "rewards/fitness_reward/mean": 5.498637676239014, + "rewards/fitness_reward/std": 4.1017327308654785, + "rewards/kidney_reward/mean": 1.4611058235168457, + "rewards/kidney_reward/std": 1.9765921831130981, + "rewards/length2tails_reward/mean": 0.4971846342086792, + "rewards/length2tails_reward/std": 0.4135686755180359, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.1329469680786133, + "rewards/thermo_reward/std": 2.697221517562866, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.21875, + "completions/mean_terminated_length": 270.21875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10403876565396786, + "epoch": 0.278, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06455550342798233, + "learning_rate": 1.993640220603141e-06, + "loss": -0.0012, + "num_tokens": 1202826.0, + "reward": 10.303912162780762, + "reward_std": 6.239825248718262, + "rewards/fitness_reward/mean": 6.35327672958374, + "rewards/fitness_reward/std": 3.1890387535095215, + "rewards/kidney_reward/mean": 2.065305233001709, + "rewards/kidney_reward/std": 1.3484585285186768, + "rewards/length2tails_reward/mean": 0.5866552591323853, + "rewards/length2tails_reward/std": 0.38823702931404114, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.7266652584075928, + "rewards/thermo_reward/std": 2.520035743713379, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09568120446056128, + "epoch": 0.28, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24781930446624756, + "learning_rate": 1.993495016708813e-06, + "loss": 0.0036, + "num_tokens": 1211506.0, + "reward": 7.731461048126221, + "reward_std": 8.551918983459473, + "rewards/fitness_reward/mean": 4.975318431854248, + "rewards/fitness_reward/std": 4.3804097175598145, + "rewards/kidney_reward/mean": 1.2619259357452393, + "rewards/kidney_reward/std": 2.03277850151062, + "rewards/length2tails_reward/mean": 0.4962129592895508, + "rewards/length2tails_reward/std": 0.44050833582878113, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.344595193862915, + "rewards/thermo_reward/std": 2.948279619216919, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09721511509269476, + "epoch": 0.282, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07676605135202408, + "learning_rate": 1.9933481792615583e-06, + "loss": 0.0034, + "num_tokens": 1220177.0, + "reward": 7.450370788574219, + "reward_std": 9.537321090698242, + "rewards/fitness_reward/mean": 4.905481338500977, + "rewards/fitness_reward/std": 4.722832202911377, + "rewards/kidney_reward/mean": 1.3217523097991943, + "rewards/kidney_reward/std": 2.265947103500366, + "rewards/length2tails_reward/mean": 0.5830744504928589, + "rewards/length2tails_reward/std": 0.3848836421966553, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.0648295879364014, + "rewards/thermo_reward/std": 3.070201873779297, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.21875, + "completions/mean_terminated_length": 269.21875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09513065218925476, + "epoch": 0.284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14675599336624146, + "learning_rate": 1.9931997085028128e-06, + "loss": -0.0064, + "num_tokens": 1228824.0, + "reward": 7.320596218109131, + "reward_std": 9.154064178466797, + "rewards/fitness_reward/mean": 5.166498184204102, + "rewards/fitness_reward/std": 4.510073184967041, + "rewards/kidney_reward/mean": 1.226707935333252, + "rewards/kidney_reward/std": 2.355365037918091, + "rewards/length2tails_reward/mean": 0.5027331709861755, + "rewards/length2tails_reward/std": 0.38256677985191345, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.7771167755126953, + "rewards/thermo_reward/std": 3.0694499015808105, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 264.46875, + "completions/mean_terminated_length": 264.46875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.10037917224690318, + "epoch": 0.286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42658743262290955, + "learning_rate": 1.9930496046767007e-06, + "loss": -0.0316, + "num_tokens": 1237319.0, + "reward": 6.065443992614746, + "reward_std": 9.798822402954102, + "rewards/fitness_reward/mean": 4.448328495025635, + "rewards/fitness_reward/std": 5.0034284591674805, + "rewards/kidney_reward/mean": 0.932793378829956, + "rewards/kidney_reward/std": 2.4061214923858643, + "rewards/length2tails_reward/mean": 0.5339027047157288, + "rewards/length2tails_reward/std": 0.4264541268348694, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.5309317111968994, + "rewards/thermo_reward/std": 3.0993340015411377, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09499884769320488, + "epoch": 0.288, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3067867159843445, + "learning_rate": 1.9928978680300293e-06, + "loss": 0.0005, + "num_tokens": 1246080.0, + "reward": 4.834901809692383, + "reward_std": 10.813472747802734, + "rewards/fitness_reward/mean": 3.440321445465088, + "rewards/fitness_reward/std": 5.519410133361816, + "rewards/kidney_reward/mean": 0.6061902642250061, + "rewards/kidney_reward/std": 2.588884115219116, + "rewards/length2tails_reward/mean": 0.7453749179840088, + "rewards/length2tails_reward/std": 0.3469243347644806, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.6138525605201721, + "rewards/thermo_reward/std": 3.252837657928467, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.11109276209026575, + "epoch": 0.29, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43782028555870056, + "learning_rate": 1.9927444988122917e-06, + "loss": -0.0001, + "num_tokens": 1254751.0, + "reward": 6.893056392669678, + "reward_std": 9.528643608093262, + "rewards/fitness_reward/mean": 4.82741641998291, + "rewards/fitness_reward/std": 4.739230155944824, + "rewards/kidney_reward/mean": 1.2925114631652832, + "rewards/kidney_reward/std": 2.267174482345581, + "rewards/length2tails_reward/mean": 0.6154762506484985, + "rewards/length2tails_reward/std": 0.3850167989730835, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.6115807294845581, + "rewards/thermo_reward/std": 3.185163974761963, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10539795272052288, + "epoch": 0.292, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1180371344089508, + "learning_rate": 1.992589497275665e-06, + "loss": 0.001, + "num_tokens": 1263471.0, + "reward": 8.197807312011719, + "reward_std": 9.033075332641602, + "rewards/fitness_reward/mean": 5.30934476852417, + "rewards/fitness_reward/std": 4.3426971435546875, + "rewards/kidney_reward/mean": 1.4229786396026611, + "rewards/kidney_reward/std": 2.1725473403930664, + "rewards/length2tails_reward/mean": 0.7128552198410034, + "rewards/length2tails_reward/std": 0.3286351263523102, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.2941994667053223, + "rewards/thermo_reward/std": 2.925856590270996, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.78125, + "completions/mean_terminated_length": 269.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11158214416354895, + "epoch": 0.294, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08068758994340897, + "learning_rate": 1.99243286367501e-06, + "loss": 0.0001, + "num_tokens": 1272136.0, + "reward": 10.795753479003906, + "reward_std": 6.373899936676025, + "rewards/fitness_reward/mean": 6.341925621032715, + "rewards/fitness_reward/std": 3.2274014949798584, + "rewards/kidney_reward/mean": 2.1037323474884033, + "rewards/kidney_reward/std": 1.3544044494628906, + "rewards/length2tails_reward/mean": 0.6009948253631592, + "rewards/length2tails_reward/std": 0.3666466176509857, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.1899969577789307, + "rewards/thermo_reward/std": 2.2698984146118164, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.1875, + "completions/mean_terminated_length": 270.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10888968966901302, + "epoch": 0.296, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06824659556150436, + "learning_rate": 1.9922745982678716e-06, + "loss": -0.003, + "num_tokens": 1280814.0, + "reward": 8.972225189208984, + "reward_std": 6.666226387023926, + "rewards/fitness_reward/mean": 5.843311786651611, + "rewards/fitness_reward/std": 3.766287088394165, + "rewards/kidney_reward/mean": 1.5617635250091553, + "rewards/kidney_reward/std": 1.6977827548980713, + "rewards/length2tails_reward/mean": 0.566065788269043, + "rewards/length2tails_reward/std": 0.3749234974384308, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.4105430841445923, + "rewards/thermo_reward/std": 2.7509870529174805, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10543608199805021, + "epoch": 0.298, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.062118224799633026, + "learning_rate": 1.9921147013144777e-06, + "loss": 0.0018, + "num_tokens": 1289497.0, + "reward": 9.55693244934082, + "reward_std": 7.30353307723999, + "rewards/fitness_reward/mean": 6.239426612854004, + "rewards/fitness_reward/std": 3.359330177307129, + "rewards/kidney_reward/mean": 1.744706392288208, + "rewards/kidney_reward/std": 1.7386865615844727, + "rewards/length2tails_reward/mean": 0.6120542883872986, + "rewards/length2tails_reward/std": 0.3751331865787506, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.4115936756134033, + "rewards/thermo_reward/std": 3.0637288093566895, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10638347268104553, + "epoch": 0.3, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07195970416069031, + "learning_rate": 1.991953173077738e-06, + "loss": -0.0008, + "num_tokens": 1298215.0, + "reward": 11.724641799926758, + "reward_std": 4.774008750915527, + "rewards/fitness_reward/mean": 7.004184722900391, + "rewards/fitness_reward/std": 2.019498586654663, + "rewards/kidney_reward/mean": 2.293221950531006, + "rewards/kidney_reward/std": 1.1455600261688232, + "rewards/length2tails_reward/mean": 0.7141435146331787, + "rewards/length2tails_reward/std": 0.3148179054260254, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.2558200359344482, + "rewards/thermo_reward/std": 2.349818468093872, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 270.3125, + "completions/mean_terminated_length": 270.3125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.10944250784814358, + "epoch": 0.302, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13008372485637665, + "learning_rate": 1.991790013823246e-06, + "loss": -0.003, + "num_tokens": 1306897.0, + "reward": 9.20780086517334, + "reward_std": 7.8232316970825195, + "rewards/fitness_reward/mean": 5.768603324890137, + "rewards/fitness_reward/std": 4.000181674957275, + "rewards/kidney_reward/mean": 1.7911608219146729, + "rewards/kidney_reward/std": 1.9888068437576294, + "rewards/length2tails_reward/mean": 0.5865706205368042, + "rewards/length2tails_reward/std": 0.39467447996139526, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.4893792867660522, + "rewards/thermo_reward/std": 2.8161673545837402, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.40625, + "completions/mean_terminated_length": 270.40625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.10371760232374072, + "epoch": 0.304, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25305068492889404, + "learning_rate": 1.9916252238192755e-06, + "loss": -0.0012, + "num_tokens": 1315582.0, + "reward": 7.669151306152344, + "reward_std": 8.788591384887695, + "rewards/fitness_reward/mean": 5.513693809509277, + "rewards/fitness_reward/std": 4.228607654571533, + "rewards/kidney_reward/mean": 1.3250335454940796, + "rewards/kidney_reward/std": 2.290503740310669, + "rewards/length2tails_reward/mean": 0.6143225431442261, + "rewards/length2tails_reward/std": 0.39379915595054626, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.66899174451828, + "rewards/thermo_reward/std": 2.995438575744629, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.28125, + "completions/mean_terminated_length": 270.28125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1102075595408678, + "epoch": 0.306, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1603720337152481, + "learning_rate": 1.991458803336782e-06, + "loss": -0.0001, + "num_tokens": 1324263.0, + "reward": 9.525554656982422, + "reward_std": 6.73523473739624, + "rewards/fitness_reward/mean": 6.219589710235596, + "rewards/fitness_reward/std": 3.4215612411499023, + "rewards/kidney_reward/mean": 1.6574617624282837, + "rewards/kidney_reward/std": 1.7385835647583008, + "rewards/length2tails_reward/mean": 0.6276772022247314, + "rewards/length2tails_reward/std": 0.36669981479644775, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.4857351779937744, + "rewards/thermo_reward/std": 2.6110165119171143, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.1875, + "completions/mean_terminated_length": 270.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11156374774873257, + "epoch": 0.308, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06302237510681152, + "learning_rate": 1.9912907526494026e-06, + "loss": -0.0008, + "num_tokens": 1332941.0, + "reward": 9.845077514648438, + "reward_std": 6.1798095703125, + "rewards/fitness_reward/mean": 6.535126686096191, + "rewards/fitness_reward/std": 2.8043503761291504, + "rewards/kidney_reward/mean": 1.7477779388427734, + "rewards/kidney_reward/std": 1.621852993965149, + "rewards/length2tails_reward/mean": 0.5888516902923584, + "rewards/length2tails_reward/std": 0.3823993504047394, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.403287649154663, + "rewards/thermo_reward/std": 2.6505541801452637, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 268.71875, + "completions/mean_terminated_length": 268.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09071650542318821, + "epoch": 0.31, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1547355204820633, + "learning_rate": 1.9911210720334545e-06, + "loss": -0.0015, + "num_tokens": 1341572.0, + "reward": 8.669211387634277, + "reward_std": 8.183530807495117, + "rewards/fitness_reward/mean": 5.633013725280762, + "rewards/fitness_reward/std": 4.088678359985352, + "rewards/kidney_reward/mean": 1.656423807144165, + "rewards/kidney_reward/std": 1.9646308422088623, + "rewards/length2tails_reward/mean": 0.4567784070968628, + "rewards/length2tails_reward/std": 0.37301844358444214, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.2340962886810303, + "rewards/thermo_reward/std": 2.6925230026245117, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.3125, + "completions/mean_terminated_length": 269.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10991678666323423, + "epoch": 0.312, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13348576426506042, + "learning_rate": 1.9909497617679347e-06, + "loss": 0.0, + "num_tokens": 1350222.0, + "reward": 9.542964935302734, + "reward_std": 6.098407745361328, + "rewards/fitness_reward/mean": 6.580748558044434, + "rewards/fitness_reward/std": 2.6743195056915283, + "rewards/kidney_reward/mean": 1.6786231994628906, + "rewards/kidney_reward/std": 1.656028389930725, + "rewards/length2tails_reward/mean": 0.5179070234298706, + "rewards/length2tails_reward/std": 0.3983818292617798, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.1318026781082153, + "rewards/thermo_reward/std": 2.8300745487213135, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.625, + "completions/mean_terminated_length": 269.625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.10514571238309145, + "epoch": 0.314, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8836485147476196, + "learning_rate": 1.99077682213452e-06, + "loss": -0.0252, + "num_tokens": 1358882.0, + "reward": 11.270038604736328, + "reward_std": 6.071025848388672, + "rewards/fitness_reward/mean": 6.54879903793335, + "rewards/fitness_reward/std": 2.982909917831421, + "rewards/kidney_reward/mean": 2.0715489387512207, + "rewards/kidney_reward/std": 1.5798089504241943, + "rewards/length2tails_reward/mean": 0.6404703259468079, + "rewards/length2tails_reward/std": 0.3451295495033264, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4856441020965576, + "rewards/thermo_reward/std": 2.362983465194702, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 267.6875, + "completions/mean_terminated_length": 267.6875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "entropy": 0.09205622784793377, + "epoch": 0.316, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12054961919784546, + "learning_rate": 1.9906022534175653e-06, + "loss": -0.0152, + "num_tokens": 1367480.0, + "reward": 5.936069965362549, + "reward_std": 8.101700782775879, + "rewards/fitness_reward/mean": 4.768802642822266, + "rewards/fitness_reward/std": 4.337706089019775, + "rewards/kidney_reward/mean": 0.7667664289474487, + "rewards/kidney_reward/std": 2.0656919479370117, + "rewards/length2tails_reward/mean": 0.5114237070083618, + "rewards/length2tails_reward/std": 0.4007585048675537, + "rewards/repeated_in_batch_reward/mean": 0.875, + "rewards/repeated_in_batch_reward/std": 0.33601075410842896, + "rewards/thermo_reward/mean": 0.261858731508255, + "rewards/thermo_reward/std": 3.0215578079223633, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.53125, + "completions/mean_terminated_length": 270.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11868708301335573, + "epoch": 0.318, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17192107439041138, + "learning_rate": 1.9904260559041067e-06, + "loss": 0.0031, + "num_tokens": 1376169.0, + "reward": 10.619016647338867, + "reward_std": 6.427189350128174, + "rewards/fitness_reward/mean": 6.33050012588501, + "rewards/fitness_reward/std": 3.0782217979431152, + "rewards/kidney_reward/mean": 1.9571011066436768, + "rewards/kidney_reward/std": 1.5345371961593628, + "rewards/length2tails_reward/mean": 0.639854907989502, + "rewards/length2tails_reward/std": 0.35456106066703796, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.1674301624298096, + "rewards/thermo_reward/std": 2.3626694679260254, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10438992222771049, + "epoch": 0.32, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09375450015068054, + "learning_rate": 1.990248229883855e-06, + "loss": -0.0047, + "num_tokens": 1384849.0, + "reward": 8.270386695861816, + "reward_std": 7.849250793457031, + "rewards/fitness_reward/mean": 5.114340782165527, + "rewards/fitness_reward/std": 4.376468658447266, + "rewards/kidney_reward/mean": 1.6285489797592163, + "rewards/kidney_reward/std": 1.7222367525100708, + "rewards/length2tails_reward/mean": 0.5605906248092651, + "rewards/length2tails_reward/std": 0.39478322863578796, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.371437430381775, + "rewards/thermo_reward/std": 2.751140832901001, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1004866361618042, + "epoch": 0.322, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06271824985742569, + "learning_rate": 1.9900687756492018e-06, + "loss": -0.0021, + "num_tokens": 1393532.0, + "reward": 7.980679512023926, + "reward_std": 9.225789070129395, + "rewards/fitness_reward/mean": 4.897353649139404, + "rewards/fitness_reward/std": 4.632526397705078, + "rewards/kidney_reward/mean": 1.4106215238571167, + "rewards/kidney_reward/std": 2.1333272457122803, + "rewards/length2tails_reward/mean": 0.6001532077789307, + "rewards/length2tails_reward/std": 0.36171630024909973, + "rewards/repeated_in_batch_reward/mean": 0.90625, + "rewards/repeated_in_batch_reward/std": 0.2961445748806, + "rewards/thermo_reward/mean": 1.5220634937286377, + "rewards/thermo_reward/std": 2.998164176940918, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 265.5625, + "completions/mean_terminated_length": 265.5625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.11199045088142157, + "epoch": 0.324, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3692063093185425, + "learning_rate": 1.9898876934952135e-06, + "loss": -0.0394, + "num_tokens": 1402062.0, + "reward": 8.863862991333008, + "reward_std": 6.968045711517334, + "rewards/fitness_reward/mean": 5.994227409362793, + "rewards/fitness_reward/std": 3.6847662925720215, + "rewards/kidney_reward/mean": 1.8423200845718384, + "rewards/kidney_reward/std": 1.663252830505371, + "rewards/length2tails_reward/mean": 0.4631441533565521, + "rewards/length2tails_reward/std": 0.41804710030555725, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.8810014724731445, + "rewards/thermo_reward/std": 2.657252311706543, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.375, + "completions/mean_terminated_length": 269.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09533638320863247, + "epoch": 0.326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.051175836473703384, + "learning_rate": 1.9897049837196347e-06, + "loss": 0.003, + "num_tokens": 1410714.0, + "reward": 8.676294326782227, + "reward_std": 8.282102584838867, + "rewards/fitness_reward/mean": 5.569098949432373, + "rewards/fitness_reward/std": 4.0911359786987305, + "rewards/kidney_reward/mean": 1.6246623992919922, + "rewards/kidney_reward/std": 1.9904649257659912, + "rewards/length2tails_reward/mean": 0.5004905462265015, + "rewards/length2tails_reward/std": 0.3728918433189392, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.3324849605560303, + "rewards/thermo_reward/std": 2.884413242340088, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.28125, + "completions/mean_terminated_length": 270.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10455688275396824, + "epoch": 0.328, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07528374344110489, + "learning_rate": 1.989520646622886e-06, + "loss": 0.0024, + "num_tokens": 1419395.0, + "reward": 6.963508605957031, + "reward_std": 9.04922103881836, + "rewards/fitness_reward/mean": 4.957913398742676, + "rewards/fitness_reward/std": 4.619761943817139, + "rewards/kidney_reward/mean": 1.1970510482788086, + "rewards/kidney_reward/std": 2.2081282138824463, + "rewards/length2tails_reward/mean": 0.6426414251327515, + "rewards/length2tails_reward/std": 0.3601728081703186, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.6442803144454956, + "rewards/thermo_reward/std": 2.9896538257598877, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.53125, + "completions/mean_terminated_length": 269.53125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1044129366055131, + "epoch": 0.33, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11145590245723724, + "learning_rate": 1.989334682508062e-06, + "loss": 0.0031, + "num_tokens": 1428052.0, + "reward": 8.322443008422852, + "reward_std": 8.941901206970215, + "rewards/fitness_reward/mean": 5.196577072143555, + "rewards/fitness_reward/std": 4.4512152671813965, + "rewards/kidney_reward/mean": 1.5438371896743774, + "rewards/kidney_reward/std": 2.067178726196289, + "rewards/length2tails_reward/mean": 0.5185511708259583, + "rewards/length2tails_reward/std": 0.39966416358947754, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.4301741123199463, + "rewards/thermo_reward/std": 2.9776289463043213, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 269.90625, + "completions/mean_terminated_length": 269.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10911750234663486, + "epoch": 0.332, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4272436201572418, + "learning_rate": 1.989147091680936e-06, + "loss": 0.0016, + "num_tokens": 1436721.0, + "reward": 11.553678512573242, + "reward_std": 6.220656394958496, + "rewards/fitness_reward/mean": 6.640511989593506, + "rewards/fitness_reward/std": 2.8358936309814453, + "rewards/kidney_reward/mean": 2.229365587234497, + "rewards/kidney_reward/std": 1.4573208093643188, + "rewards/length2tails_reward/mean": 0.6149606704711914, + "rewards/length2tails_reward/std": 0.34033235907554626, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.5223050117492676, + "rewards/thermo_reward/std": 2.36811900138855, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09208371117711067, + "epoch": 0.334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13034819066524506, + "learning_rate": 1.988957874449953e-06, + "loss": 0.0036, + "num_tokens": 1445393.0, + "reward": 8.939111709594727, + "reward_std": 9.140274047851562, + "rewards/fitness_reward/mean": 5.218906402587891, + "rewards/fitness_reward/std": 4.415480613708496, + "rewards/kidney_reward/mean": 1.5986276865005493, + "rewards/kidney_reward/std": 2.1802477836608887, + "rewards/length2tails_reward/mean": 0.5977396965026855, + "rewards/length2tails_reward/std": 0.3474375307559967, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.961803674697876, + "rewards/thermo_reward/std": 2.926722764968872, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1023161243647337, + "epoch": 0.336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05429290235042572, + "learning_rate": 1.9887670311262328e-06, + "loss": -0.0021, + "num_tokens": 1454109.0, + "reward": 11.80929183959961, + "reward_std": 4.472561836242676, + "rewards/fitness_reward/mean": 6.948788166046143, + "rewards/fitness_reward/std": 2.023350238800049, + "rewards/kidney_reward/mean": 2.316553831100464, + "rewards/kidney_reward/std": 1.0072654485702515, + "rewards/length2tails_reward/mean": 0.6750909686088562, + "rewards/length2tails_reward/std": 0.3512890338897705, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.376441478729248, + "rewards/thermo_reward/std": 2.003629684448242, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10454714018851519, + "epoch": 0.338, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1053980216383934, + "learning_rate": 1.9885745620235697e-06, + "loss": -0.0068, + "num_tokens": 1462844.0, + "reward": 9.110237121582031, + "reward_std": 7.532092094421387, + "rewards/fitness_reward/mean": 5.710217475891113, + "rewards/fitness_reward/std": 3.8100883960723877, + "rewards/kidney_reward/mean": 1.6665058135986328, + "rewards/kidney_reward/std": 1.8492753505706787, + "rewards/length2tails_reward/mean": 0.6642658710479736, + "rewards/length2tails_reward/std": 0.39535000920295715, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.567087173461914, + "rewards/thermo_reward/std": 2.6299617290496826, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08847399707883596, + "epoch": 0.34, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07097867131233215, + "learning_rate": 1.9883804674584306e-06, + "loss": 0.0026, + "num_tokens": 1471524.0, + "reward": 6.478607177734375, + "reward_std": 9.610648155212402, + "rewards/fitness_reward/mean": 4.710236549377441, + "rewards/fitness_reward/std": 4.720317840576172, + "rewards/kidney_reward/mean": 0.9129420518875122, + "rewards/kidney_reward/std": 2.3220455646514893, + "rewards/length2tails_reward/mean": 0.5628418922424316, + "rewards/length2tails_reward/std": 0.41073399782180786, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.6991442441940308, + "rewards/thermo_reward/std": 3.20623517036438, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.21875, + "completions/mean_terminated_length": 273.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11202279198914766, + "epoch": 0.342, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11420899629592896, + "learning_rate": 1.9881847477499555e-06, + "loss": 0.0025, + "num_tokens": 1480299.0, + "reward": 7.849916458129883, + "reward_std": 9.838308334350586, + "rewards/fitness_reward/mean": 4.82754373550415, + "rewards/fitness_reward/std": 4.876658916473389, + "rewards/kidney_reward/mean": 1.3583157062530518, + "rewards/kidney_reward/std": 2.398590564727783, + "rewards/length2tails_reward/mean": 0.7617112994194031, + "rewards/length2tails_reward/std": 0.3424564301967621, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.4878864288330078, + "rewards/thermo_reward/std": 3.0041966438293457, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10550755634903908, + "epoch": 0.344, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4681124985218048, + "learning_rate": 1.9879874032199563e-06, + "loss": 0.003, + "num_tokens": 1489017.0, + "reward": 7.871708869934082, + "reward_std": 9.376019477844238, + "rewards/fitness_reward/mean": 5.116046905517578, + "rewards/fitness_reward/std": 4.47206449508667, + "rewards/kidney_reward/mean": 1.302150011062622, + "rewards/kidney_reward/std": 2.3035614490509033, + "rewards/length2tails_reward/mean": 0.6878268718719482, + "rewards/length2tails_reward/std": 0.35356101393699646, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.2847297191619873, + "rewards/thermo_reward/std": 2.9786298274993896, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.875, + "completions/mean_terminated_length": 270.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10861378256231546, + "epoch": 0.346, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10255390405654907, + "learning_rate": 1.9877884341929164e-06, + "loss": -0.0002, + "num_tokens": 1497717.0, + "reward": 10.451683044433594, + "reward_std": 6.380789279937744, + "rewards/fitness_reward/mean": 6.58067512512207, + "rewards/fitness_reward/std": 2.850431442260742, + "rewards/kidney_reward/mean": 1.9992852210998535, + "rewards/kidney_reward/std": 1.6703037023544312, + "rewards/length2tails_reward/mean": 0.642656683921814, + "rewards/length2tails_reward/std": 0.3293820917606354, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.7074573040008545, + "rewards/thermo_reward/std": 2.617042064666748, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10535236541181803, + "epoch": 0.348, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1874845027923584, + "learning_rate": 1.9875878409959902e-06, + "loss": 0.0, + "num_tokens": 1506381.0, + "reward": 11.276988983154297, + "reward_std": 4.965340614318848, + "rewards/fitness_reward/mean": 6.700150489807129, + "rewards/fitness_reward/std": 2.6060447692871094, + "rewards/kidney_reward/mean": 2.1925644874572754, + "rewards/kidney_reward/std": 1.1760989427566528, + "rewards/length2tails_reward/mean": 0.5278898477554321, + "rewards/length2tails_reward/std": 0.39283841848373413, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.23148512840271, + "rewards/thermo_reward/std": 2.2584495544433594, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.1875, + "completions/mean_terminated_length": 270.1875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1035816827788949, + "epoch": 0.35, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06968332827091217, + "learning_rate": 1.9873856239590034e-06, + "loss": -0.0035, + "num_tokens": 1515059.0, + "reward": 9.632209777832031, + "reward_std": 7.117125988006592, + "rewards/fitness_reward/mean": 5.655714988708496, + "rewards/fitness_reward/std": 4.042262077331543, + "rewards/kidney_reward/mean": 1.9653785228729248, + "rewards/kidney_reward/std": 1.48482084274292, + "rewards/length2tails_reward/mean": 0.5557355284690857, + "rewards/length2tails_reward/std": 0.4126896262168884, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 1.8617929220199585, + "rewards/thermo_reward/std": 2.4798786640167236, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09991635009646416, + "epoch": 0.352, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07047705352306366, + "learning_rate": 1.98718178341445e-06, + "loss": -0.0047, + "num_tokens": 1523782.0, + "reward": 9.619202613830566, + "reward_std": 7.000319004058838, + "rewards/fitness_reward/mean": 5.932534694671631, + "rewards/fitness_reward/std": 3.6836514472961426, + "rewards/kidney_reward/mean": 1.84051513671875, + "rewards/kidney_reward/std": 1.6164661645889282, + "rewards/length2tails_reward/mean": 0.6280834674835205, + "rewards/length2tails_reward/std": 0.4062521755695343, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.6833446025848389, + "rewards/thermo_reward/std": 2.712756395339966, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09833968244493008, + "epoch": 0.354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06103222444653511, + "learning_rate": 1.9869763196974956e-06, + "loss": -0.0038, + "num_tokens": 1532458.0, + "reward": 8.60630989074707, + "reward_std": 8.169486999511719, + "rewards/fitness_reward/mean": 5.390318393707275, + "rewards/fitness_reward/std": 4.1763691902160645, + "rewards/kidney_reward/mean": 1.568656325340271, + "rewards/kidney_reward/std": 1.8943867683410645, + "rewards/length2tails_reward/mean": 0.5682525634765625, + "rewards/length2tails_reward/std": 0.4164934456348419, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 1.4967609643936157, + "rewards/thermo_reward/std": 2.6284830570220947, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.3125, + "completions/mean_terminated_length": 270.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10469309892505407, + "epoch": 0.356, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15595051646232605, + "learning_rate": 1.9867692331459733e-06, + "loss": -0.0012, + "num_tokens": 1541140.0, + "reward": 7.887426376342773, + "reward_std": 8.45262336730957, + "rewards/fitness_reward/mean": 5.164236068725586, + "rewards/fitness_reward/std": 4.271865367889404, + "rewards/kidney_reward/mean": 1.5095407962799072, + "rewards/kidney_reward/std": 1.9843440055847168, + "rewards/length2tails_reward/mean": 0.5877651572227478, + "rewards/length2tails_reward/std": 0.36727017164230347, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.0548732280731201, + "rewards/thermo_reward/std": 2.857112169265747, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1056223763152957, + "epoch": 0.358, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06294183433055878, + "learning_rate": 1.9865605241003845e-06, + "loss": -0.0045, + "num_tokens": 1549869.0, + "reward": 10.02109146118164, + "reward_std": 6.616111755371094, + "rewards/fitness_reward/mean": 6.333710670471191, + "rewards/fitness_reward/std": 3.2548043727874756, + "rewards/kidney_reward/mean": 1.9349606037139893, + "rewards/kidney_reward/std": 1.5588946342468262, + "rewards/length2tails_reward/mean": 0.681917667388916, + "rewards/length2tails_reward/std": 0.3754919469356537, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.584228515625, + "rewards/thermo_reward/std": 2.648305892944336, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.09375, + "completions/mean_terminated_length": 271.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10342816868796945, + "epoch": 0.36, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0756261870265007, + "learning_rate": 1.9863501929038997e-06, + "loss": -0.0007, + "num_tokens": 1558576.0, + "reward": 6.979414463043213, + "reward_std": 9.660487174987793, + "rewards/fitness_reward/mean": 4.5425872802734375, + "rewards/fitness_reward/std": 4.841700553894043, + "rewards/kidney_reward/mean": 1.0623948574066162, + "rewards/kidney_reward/std": 2.2623515129089355, + "rewards/length2tails_reward/mean": 0.658684253692627, + "rewards/length2tails_reward/std": 0.381984144449234, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.2085641622543335, + "rewards/thermo_reward/std": 3.015942096710205, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1091130031272769, + "epoch": 0.362, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20066675543785095, + "learning_rate": 1.9861382399023546e-06, + "loss": -0.0014, + "num_tokens": 1567308.0, + "reward": 9.064428329467773, + "reward_std": 7.4316086769104, + "rewards/fitness_reward/mean": 6.176366329193115, + "rewards/fitness_reward/std": 3.3721847534179688, + "rewards/kidney_reward/mean": 1.4785151481628418, + "rewards/kidney_reward/std": 1.8624529838562012, + "rewards/length2tails_reward/mean": 0.6511343121528625, + "rewards/length2tails_reward/std": 0.39212942123413086, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.2444343566894531, + "rewards/thermo_reward/std": 2.9861016273498535, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10733828134834766, + "epoch": 0.364, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.089052215218544, + "learning_rate": 1.985924665444254e-06, + "loss": -0.0008, + "num_tokens": 1576019.0, + "reward": 7.983680725097656, + "reward_std": 8.451763153076172, + "rewards/fitness_reward/mean": 5.157064437866211, + "rewards/fitness_reward/std": 4.3942108154296875, + "rewards/kidney_reward/mean": 1.4362343549728394, + "rewards/kidney_reward/std": 2.0755233764648438, + "rewards/length2tails_reward/mean": 0.6398859620094299, + "rewards/length2tails_reward/std": 0.39266476035118103, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.2263935804367065, + "rewards/thermo_reward/std": 2.855211019515991, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 268.6875, + "completions/mean_terminated_length": 268.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0979474326595664, + "epoch": 0.366, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07531240582466125, + "learning_rate": 1.9857094698807663e-06, + "loss": -0.0012, + "num_tokens": 1584649.0, + "reward": 9.46112060546875, + "reward_std": 7.812624454498291, + "rewards/fitness_reward/mean": 5.858367919921875, + "rewards/fitness_reward/std": 3.7519853115081787, + "rewards/kidney_reward/mean": 1.7861301898956299, + "rewards/kidney_reward/std": 1.8386919498443604, + "rewards/length2tails_reward/mean": 0.4857766628265381, + "rewards/length2tails_reward/std": 0.35673651099205017, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.6680454015731812, + "rewards/thermo_reward/std": 2.858015298843384, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.375, + "completions/mean_terminated_length": 269.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10918328817933798, + "epoch": 0.368, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08639601618051529, + "learning_rate": 1.9854926535657268e-06, + "loss": 0.0024, + "num_tokens": 1593301.0, + "reward": 8.301219940185547, + "reward_std": 7.373518943786621, + "rewards/fitness_reward/mean": 6.095327377319336, + "rewards/fitness_reward/std": 3.293491840362549, + "rewards/kidney_reward/mean": 1.462351679801941, + "rewards/kidney_reward/std": 1.9810431003570557, + "rewards/length2tails_reward/mean": 0.5736892223358154, + "rewards/length2tails_reward/std": 0.3360866606235504, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.5861721634864807, + "rewards/thermo_reward/std": 3.047858953475952, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09814856946468353, + "epoch": 0.37, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27271583676338196, + "learning_rate": 1.9852742168556353e-06, + "loss": -0.0028, + "num_tokens": 1602012.0, + "reward": 6.0901641845703125, + "reward_std": 10.192516326904297, + "rewards/fitness_reward/mean": 3.8694372177124023, + "rewards/fitness_reward/std": 5.288331031799316, + "rewards/kidney_reward/mean": 0.9918411374092102, + "rewards/kidney_reward/std": 2.3886590003967285, + "rewards/length2tails_reward/mean": 0.6225963830947876, + "rewards/length2tails_reward/std": 0.40456104278564453, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 1.0728765726089478, + "rewards/thermo_reward/std": 3.126966953277588, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.53125, + "completions/mean_terminated_length": 270.53125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10325660184025764, + "epoch": 0.372, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5015514492988586, + "learning_rate": 1.9850541601096565e-06, + "loss": -0.0007, + "num_tokens": 1610701.0, + "reward": 7.214511871337891, + "reward_std": 9.890998840332031, + "rewards/fitness_reward/mean": 4.450111389160156, + "rewards/fitness_reward/std": 5.011604309082031, + "rewards/kidney_reward/mean": 1.2150135040283203, + "rewards/kidney_reward/std": 2.3736073970794678, + "rewards/length2tails_reward/mean": 0.5748029351234436, + "rewards/length2tails_reward/std": 0.4244406521320343, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.391906976699829, + "rewards/thermo_reward/std": 3.0063281059265137, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10104412585496902, + "epoch": 0.374, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09444596618413925, + "learning_rate": 1.9848324836896185e-06, + "loss": -0.0007, + "num_tokens": 1619442.0, + "reward": 8.749990463256836, + "reward_std": 8.362157821655273, + "rewards/fitness_reward/mean": 5.2926411628723145, + "rewards/fitness_reward/std": 4.387931823730469, + "rewards/kidney_reward/mean": 1.7029788494110107, + "rewards/kidney_reward/std": 1.8882851600646973, + "rewards/length2tails_reward/mean": 0.6900187134742737, + "rewards/length2tails_reward/std": 0.3773564100265503, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.5853676795959473, + "rewards/thermo_reward/std": 2.8135106563568115, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 268.25, + "completions/mean_terminated_length": 268.25, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.09201871184632182, + "epoch": 0.376, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19163833558559418, + "learning_rate": 1.9846091879600123e-06, + "loss": -0.0053, + "num_tokens": 1628058.0, + "reward": 7.008411884307861, + "reward_std": 10.034730911254883, + "rewards/fitness_reward/mean": 4.643359661102295, + "rewards/fitness_reward/std": 4.732275009155273, + "rewards/kidney_reward/mean": 1.0078006982803345, + "rewards/kidney_reward/std": 2.456132411956787, + "rewards/length2tails_reward/mean": 0.5096907615661621, + "rewards/length2tails_reward/std": 0.4166960120201111, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.2062824964523315, + "rewards/thermo_reward/std": 3.2689130306243896, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.625, + "completions/mean_terminated_length": 270.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10740740410983562, + "epoch": 0.378, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0717017725110054, + "learning_rate": 1.9843842732879912e-06, + "loss": 0.0008, + "num_tokens": 1636750.0, + "reward": 8.669443130493164, + "reward_std": 8.252121925354004, + "rewards/fitness_reward/mean": 5.560091018676758, + "rewards/fitness_reward/std": 4.254537105560303, + "rewards/kidney_reward/mean": 1.642980933189392, + "rewards/kidney_reward/std": 2.0037031173706055, + "rewards/length2tails_reward/mean": 0.6206031441688538, + "rewards/length2tails_reward/std": 0.38836368918418884, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.3043103218078613, + "rewards/thermo_reward/std": 2.909794330596924, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09697377681732178, + "epoch": 0.38, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10256364941596985, + "learning_rate": 1.9841577400433715e-06, + "loss": -0.0019, + "num_tokens": 1645451.0, + "reward": 11.645931243896484, + "reward_std": 4.940904140472412, + "rewards/fitness_reward/mean": 6.897823810577393, + "rewards/fitness_reward/std": 2.0551373958587646, + "rewards/kidney_reward/mean": 2.0693302154541016, + "rewards/kidney_reward/std": 1.310836911201477, + "rewards/length2tails_reward/mean": 0.588446855545044, + "rewards/length2tails_reward/std": 0.38965609669685364, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.5199332237243652, + "rewards/thermo_reward/std": 2.15556001663208, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 265.5625, + "completions/mean_terminated_length": 265.5625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.1106615662574768, + "epoch": 0.382, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5850204229354858, + "learning_rate": 1.9839295885986295e-06, + "loss": -0.0808, + "num_tokens": 1653981.0, + "reward": 10.428827285766602, + "reward_std": 6.0077056884765625, + "rewards/fitness_reward/mean": 6.506265640258789, + "rewards/fitness_reward/std": 2.7486143112182617, + "rewards/kidney_reward/mean": 2.0006251335144043, + "rewards/kidney_reward/std": 1.3787603378295898, + "rewards/length2tails_reward/mean": 0.6331422328948975, + "rewards/length2tails_reward/std": 0.37140002846717834, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.7586220502853394, + "rewards/thermo_reward/std": 2.7680184841156006, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.90625, + "completions/mean_terminated_length": 269.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10634398553520441, + "epoch": 0.384, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07652062177658081, + "learning_rate": 1.9836998193289038e-06, + "loss": -0.0068, + "num_tokens": 1662650.0, + "reward": 10.72802448272705, + "reward_std": 5.3211283683776855, + "rewards/fitness_reward/mean": 6.853697299957275, + "rewards/fitness_reward/std": 1.9966347217559814, + "rewards/kidney_reward/mean": 1.8450117111206055, + "rewards/kidney_reward/std": 1.6589022874832153, + "rewards/length2tails_reward/mean": 0.5354677438735962, + "rewards/length2tails_reward/std": 0.4196150004863739, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.875769019126892, + "rewards/thermo_reward/std": 2.499567985534668, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1068008397705853, + "epoch": 0.386, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1525895595550537, + "learning_rate": 1.9834684326119915e-06, + "loss": -0.0025, + "num_tokens": 1671382.0, + "reward": 8.363931655883789, + "reward_std": 8.365130424499512, + "rewards/fitness_reward/mean": 5.590615272521973, + "rewards/fitness_reward/std": 4.040146350860596, + "rewards/kidney_reward/mean": 1.3528531789779663, + "rewards/kidney_reward/std": 2.032255172729492, + "rewards/length2tails_reward/mean": 0.6880743503570557, + "rewards/length2tails_reward/std": 0.36803537607192993, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.2516562938690186, + "rewards/thermo_reward/std": 3.0139715671539307, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10786493681371212, + "epoch": 0.388, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1900808960199356, + "learning_rate": 1.98323542882835e-06, + "loss": 0.0009, + "num_tokens": 1680147.0, + "reward": 6.575066566467285, + "reward_std": 9.77808666229248, + "rewards/fitness_reward/mean": 4.85727071762085, + "rewards/fitness_reward/std": 4.808586597442627, + "rewards/kidney_reward/mean": 1.0090776681900024, + "rewards/kidney_reward/std": 2.365453004837036, + "rewards/length2tails_reward/mean": 0.7621837258338928, + "rewards/length2tails_reward/std": 0.32337686419487, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.532500147819519, + "rewards/thermo_reward/std": 3.1945226192474365, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10732665937393904, + "epoch": 0.39, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06709634512662888, + "learning_rate": 1.9830008083610964e-06, + "loss": -0.0003, + "num_tokens": 1688845.0, + "reward": 8.791130065917969, + "reward_std": 8.393945693969727, + "rewards/fitness_reward/mean": 5.577649116516113, + "rewards/fitness_reward/std": 4.212261199951172, + "rewards/kidney_reward/mean": 1.5661720037460327, + "rewards/kidney_reward/std": 1.9226521253585815, + "rewards/length2tails_reward/mean": 0.6098302602767944, + "rewards/length2tails_reward/std": 0.40974730253219604, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.4863260984420776, + "rewards/thermo_reward/std": 2.968444585800171, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10089261550456285, + "epoch": 0.392, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14002718031406403, + "learning_rate": 1.982764571596004e-06, + "loss": 0.0004, + "num_tokens": 1697539.0, + "reward": 8.121866226196289, + "reward_std": 8.40750789642334, + "rewards/fitness_reward/mean": 5.079010009765625, + "rewards/fitness_reward/std": 4.419788360595703, + "rewards/kidney_reward/mean": 1.4961414337158203, + "rewards/kidney_reward/std": 1.9750374555587769, + "rewards/length2tails_reward/mean": 0.6118898391723633, + "rewards/length2tails_reward/std": 0.39571040868759155, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.3855266571044922, + "rewards/thermo_reward/std": 2.767749786376953, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.65625, + "completions/mean_terminated_length": 269.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0964017822407186, + "epoch": 0.394, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1263321042060852, + "learning_rate": 1.9825267189215047e-06, + "loss": -0.0012, + "num_tokens": 1706200.0, + "reward": 8.513752937316895, + "reward_std": 8.518855094909668, + "rewards/fitness_reward/mean": 5.270111083984375, + "rewards/fitness_reward/std": 4.430210590362549, + "rewards/kidney_reward/mean": 1.7094039916992188, + "rewards/kidney_reward/std": 1.8589138984680176, + "rewards/length2tails_reward/mean": 0.5403980612754822, + "rewards/length2tails_reward/std": 0.3900211453437805, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.3801982402801514, + "rewards/thermo_reward/std": 2.656468629837036, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11020283959805965, + "epoch": 0.396, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06863213330507278, + "learning_rate": 1.9822872507286887e-06, + "loss": -0.0007, + "num_tokens": 1714950.0, + "reward": 7.311960697174072, + "reward_std": 9.5090970993042, + "rewards/fitness_reward/mean": 4.814295768737793, + "rewards/fitness_reward/std": 4.636486530303955, + "rewards/kidney_reward/mean": 1.1605840921401978, + "rewards/kidney_reward/std": 2.2890994548797607, + "rewards/length2tails_reward/mean": 0.7226451635360718, + "rewards/length2tails_reward/std": 0.33867594599723816, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.1648168563842773, + "rewards/thermo_reward/std": 3.1884403228759766, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.875, + "completions/mean_terminated_length": 269.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.109826167114079, + "epoch": 0.398, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20804084837436676, + "learning_rate": 1.9820461674113e-06, + "loss": -0.0033, + "num_tokens": 1723618.0, + "reward": 7.401235580444336, + "reward_std": 8.667125701904297, + "rewards/fitness_reward/mean": 5.332418441772461, + "rewards/fitness_reward/std": 4.303309440612793, + "rewards/kidney_reward/mean": 1.2573118209838867, + "rewards/kidney_reward/std": 2.1245853900909424, + "rewards/length2tails_reward/mean": 0.5858131051063538, + "rewards/length2tails_reward/std": 0.3964066207408905, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.6529242992401123, + "rewards/thermo_reward/std": 3.1356236934661865, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0995940687134862, + "epoch": 0.4, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14407816529273987, + "learning_rate": 1.9818034693657404e-06, + "loss": -0.0018, + "num_tokens": 1732305.0, + "reward": 10.46460247039795, + "reward_std": 7.0780768394470215, + "rewards/fitness_reward/mean": 6.211520195007324, + "rewards/fitness_reward/std": 3.2666735649108887, + "rewards/kidney_reward/mean": 1.8888168334960938, + "rewards/kidney_reward/std": 1.7426211833953857, + "rewards/length2tails_reward/mean": 0.5859594345092773, + "rewards/length2tails_reward/std": 0.3815963566303253, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.2056691646575928, + "rewards/thermo_reward/std": 2.506162405014038, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.5625, + "completions/mean_terminated_length": 269.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09627026505768299, + "epoch": 0.402, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1052074134349823, + "learning_rate": 1.9815591569910653e-06, + "loss": -0.0033, + "num_tokens": 1740963.0, + "reward": 8.757131576538086, + "reward_std": 8.111889839172363, + "rewards/fitness_reward/mean": 5.644378185272217, + "rewards/fitness_reward/std": 3.914625644683838, + "rewards/kidney_reward/mean": 1.5931066274642944, + "rewards/kidney_reward/std": 1.9486136436462402, + "rewards/length2tails_reward/mean": 0.5394684076309204, + "rewards/length2tails_reward/std": 0.38302308320999146, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.365700364112854, + "rewards/thermo_reward/std": 2.8859643936157227, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.1875, + "completions/mean_terminated_length": 270.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10567681677639484, + "epoch": 0.404, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08214101195335388, + "learning_rate": 1.9813132306889854e-06, + "loss": -0.0019, + "num_tokens": 1749641.0, + "reward": 11.298721313476562, + "reward_std": 4.785417079925537, + "rewards/fitness_reward/mean": 7.003148555755615, + "rewards/fitness_reward/std": 2.025360345840454, + "rewards/kidney_reward/mean": 2.3104147911071777, + "rewards/kidney_reward/std": 1.040230393409729, + "rewards/length2tails_reward/mean": 0.5893241763114929, + "rewards/length2tails_reward/std": 0.381794810295105, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.8262252807617188, + "rewards/thermo_reward/std": 2.564194679260254, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.53125, + "completions/mean_terminated_length": 270.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1086399257183075, + "epoch": 0.406, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.111503966152668, + "learning_rate": 1.981065690863864e-06, + "loss": 0.0013, + "num_tokens": 1758330.0, + "reward": 8.51333236694336, + "reward_std": 8.681916236877441, + "rewards/fitness_reward/mean": 5.229581832885742, + "rewards/fitness_reward/std": 4.509924411773682, + "rewards/kidney_reward/mean": 1.6099677085876465, + "rewards/kidney_reward/std": 2.048271656036377, + "rewards/length2tails_reward/mean": 0.6296185255050659, + "rewards/length2tails_reward/std": 0.333383709192276, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.5108214616775513, + "rewards/thermo_reward/std": 2.850874900817871, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09983132593333721, + "epoch": 0.408, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0952368900179863, + "learning_rate": 1.9808165379227195e-06, + "loss": 0.001, + "num_tokens": 1767001.0, + "reward": 9.538595199584961, + "reward_std": 8.022995948791504, + "rewards/fitness_reward/mean": 6.058516502380371, + "rewards/fitness_reward/std": 3.590620756149292, + "rewards/kidney_reward/mean": 1.707038164138794, + "rewards/kidney_reward/std": 2.0653762817382812, + "rewards/length2tails_reward/mean": 0.5839086174964905, + "rewards/length2tails_reward/std": 0.3778667449951172, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.6146492958068848, + "rewards/thermo_reward/std": 2.8776142597198486, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.03125, + "completions/mean_terminated_length": 269.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10124222468584776, + "epoch": 0.41, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0786767452955246, + "learning_rate": 1.98056577227522e-06, + "loss": 0.0004, + "num_tokens": 1775642.0, + "reward": 8.277677536010742, + "reward_std": 8.470403671264648, + "rewards/fitness_reward/mean": 5.537692070007324, + "rewards/fitness_reward/std": 3.9734997749328613, + "rewards/kidney_reward/mean": 1.4755680561065674, + "rewards/kidney_reward/std": 2.1754350662231445, + "rewards/length2tails_reward/mean": 0.4818817973136902, + "rewards/length2tails_reward/std": 0.4087766408920288, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.1162288188934326, + "rewards/thermo_reward/std": 2.740323781967163, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.875, + "completions/mean_terminated_length": 270.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09372684638947248, + "epoch": 0.412, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07380766421556473, + "learning_rate": 1.9803133943336874e-06, + "loss": -0.0034, + "num_tokens": 1784342.0, + "reward": 7.565323352813721, + "reward_std": 8.948416709899902, + "rewards/fitness_reward/mean": 4.959705352783203, + "rewards/fitness_reward/std": 4.494466304779053, + "rewards/kidney_reward/mean": 1.414535403251648, + "rewards/kidney_reward/std": 2.106023073196411, + "rewards/length2tails_reward/mean": 0.596523642539978, + "rewards/length2tails_reward/std": 0.43391671776771545, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 1.0376806259155273, + "rewards/thermo_reward/std": 2.8510472774505615, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.9375, + "completions/mean_terminated_length": 269.9375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10692477971315384, + "epoch": 0.414, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11711820214986801, + "learning_rate": 1.9800594045130928e-06, + "loss": 0.0029, + "num_tokens": 1793012.0, + "reward": 8.848175048828125, + "reward_std": 8.377004623413086, + "rewards/fitness_reward/mean": 5.87990140914917, + "rewards/fitness_reward/std": 3.820521593093872, + "rewards/kidney_reward/mean": 1.4009602069854736, + "rewards/kidney_reward/std": 2.222740888595581, + "rewards/length2tails_reward/mean": 0.5454018115997314, + "rewards/length2tails_reward/std": 0.3798547387123108, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.412773847579956, + "rewards/thermo_reward/std": 3.057025194168091, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11469333432614803, + "epoch": 0.416, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07612542062997818, + "learning_rate": 1.97980380323106e-06, + "loss": -0.0047, + "num_tokens": 1801710.0, + "reward": 7.182913780212402, + "reward_std": 9.170951843261719, + "rewards/fitness_reward/mean": 4.917283535003662, + "rewards/fitness_reward/std": 4.572504997253418, + "rewards/kidney_reward/mean": 1.20510733127594, + "rewards/kidney_reward/std": 2.1817445755004883, + "rewards/length2tails_reward/mean": 0.6531508564949036, + "rewards/length2tails_reward/std": 0.3536904454231262, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.895207941532135, + "rewards/thermo_reward/std": 2.9525623321533203, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11213183123618364, + "epoch": 0.418, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07885367423295975, + "learning_rate": 1.97954659090786e-06, + "loss": -0.0038, + "num_tokens": 1810406.0, + "reward": 6.919522762298584, + "reward_std": 8.68114948272705, + "rewards/fitness_reward/mean": 4.821925163269043, + "rewards/fitness_reward/std": 4.653115272521973, + "rewards/kidney_reward/mean": 1.1035808324813843, + "rewards/kidney_reward/std": 2.052344560623169, + "rewards/length2tails_reward/mean": 0.632168173789978, + "rewards/length2tails_reward/std": 0.3854258358478546, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.8307993412017822, + "rewards/thermo_reward/std": 2.9898102283477783, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.34375, + "completions/mean_terminated_length": 269.34375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "entropy": 0.10570395179092884, + "epoch": 0.42, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2664085030555725, + "learning_rate": 1.9792877679664147e-06, + "loss": -0.0205, + "num_tokens": 1819057.0, + "reward": 7.419079303741455, + "reward_std": 8.691427230834961, + "rewards/fitness_reward/mean": 5.043792247772217, + "rewards/fitness_reward/std": 4.39654541015625, + "rewards/kidney_reward/mean": 1.1797351837158203, + "rewards/kidney_reward/std": 2.1847445964813232, + "rewards/length2tails_reward/mean": 0.6589158773422241, + "rewards/length2tails_reward/std": 0.38090842962265015, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.0296605825424194, + "rewards/thermo_reward/std": 2.942087173461914, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 268.96875, + "completions/mean_terminated_length": 268.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.095816302113235, + "epoch": 0.422, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08715507388114929, + "learning_rate": 1.9790273348322927e-06, + "loss": 0.0019, + "num_tokens": 1827696.0, + "reward": 8.556151390075684, + "reward_std": 8.206366539001465, + "rewards/fitness_reward/mean": 5.665445327758789, + "rewards/fitness_reward/std": 4.011801719665527, + "rewards/kidney_reward/mean": 1.4945118427276611, + "rewards/kidney_reward/std": 1.9173368215560913, + "rewards/length2tails_reward/mean": 0.4852709472179413, + "rewards/length2tails_reward/std": 0.39102447032928467, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.2476671934127808, + "rewards/thermo_reward/std": 3.026831865310669, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.28125, + "completions/mean_terminated_length": 270.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10708794835954905, + "epoch": 0.424, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0768938884139061, + "learning_rate": 1.9787652919337115e-06, + "loss": -0.001, + "num_tokens": 1836377.0, + "reward": 11.526771545410156, + "reward_std": 5.237046718597412, + "rewards/fitness_reward/mean": 6.383553504943848, + "rewards/fitness_reward/std": 3.095924139022827, + "rewards/kidney_reward/mean": 2.3126468658447266, + "rewards/kidney_reward/std": 1.0282318592071533, + "rewards/length2tails_reward/mean": 0.6096439361572266, + "rewards/length2tails_reward/std": 0.3747730553150177, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6696066856384277, + "rewards/thermo_reward/std": 2.085958957672119, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 266.75, + "completions/mean_terminated_length": 266.75, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.10727542918175459, + "epoch": 0.426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28065893054008484, + "learning_rate": 1.9785016397015338e-06, + "loss": -0.0418, + "num_tokens": 1844945.0, + "reward": 6.286587715148926, + "reward_std": 9.712299346923828, + "rewards/fitness_reward/mean": 4.788792610168457, + "rewards/fitness_reward/std": 4.699509143829346, + "rewards/kidney_reward/mean": 0.8297417163848877, + "rewards/kidney_reward/std": 2.447777032852173, + "rewards/length2tails_reward/mean": 0.7117043733596802, + "rewards/length2tails_reward/std": 0.34659260511398315, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.4968832731246948, + "rewards/thermo_reward/std": 3.415356397628784, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.09375, + "completions/mean_terminated_length": 270.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10078845452517271, + "epoch": 0.428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08422592282295227, + "learning_rate": 1.9782363785692705e-06, + "loss": 0.0026, + "num_tokens": 1853620.0, + "reward": 6.68724250793457, + "reward_std": 8.57685661315918, + "rewards/fitness_reward/mean": 5.163091659545898, + "rewards/fitness_reward/std": 4.379825592041016, + "rewards/kidney_reward/mean": 1.1229581832885742, + "rewards/kidney_reward/std": 2.124249219894409, + "rewards/length2tails_reward/mean": 0.5803974866867065, + "rewards/length2tails_reward/std": 0.40740472078323364, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.24315303564071655, + "rewards/thermo_reward/std": 2.994739294052124, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10598285868763924, + "epoch": 0.43, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15869423747062683, + "learning_rate": 1.9779695089730764e-06, + "loss": -0.003, + "num_tokens": 1862297.0, + "reward": 8.356054306030273, + "reward_std": 9.389469146728516, + "rewards/fitness_reward/mean": 5.200437068939209, + "rewards/fitness_reward/std": 4.589191436767578, + "rewards/kidney_reward/mean": 1.5199666023254395, + "rewards/kidney_reward/std": 2.305187702178955, + "rewards/length2tails_reward/mean": 0.5944569110870361, + "rewards/length2tails_reward/std": 0.35584756731987, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.4762049913406372, + "rewards/thermo_reward/std": 2.8045899868011475, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11291093192994595, + "epoch": 0.432, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.211589053273201, + "learning_rate": 1.9777010313517514e-06, + "loss": 0.0001, + "num_tokens": 1871012.0, + "reward": 9.170920372009277, + "reward_std": 7.176021575927734, + "rewards/fitness_reward/mean": 6.2553839683532715, + "rewards/fitness_reward/std": 3.310086250305176, + "rewards/kidney_reward/mean": 1.620316505432129, + "rewards/kidney_reward/std": 1.8193150758743286, + "rewards/length2tails_reward/mean": 0.6244357824325562, + "rewards/length2tails_reward/std": 0.370291143655777, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.1327762603759766, + "rewards/thermo_reward/std": 2.8469913005828857, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09871920570731163, + "epoch": 0.434, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07975756376981735, + "learning_rate": 1.9774309461467397e-06, + "loss": -0.0064, + "num_tokens": 1879696.0, + "reward": 9.119773864746094, + "reward_std": 7.26805305480957, + "rewards/fitness_reward/mean": 5.79979133605957, + "rewards/fitness_reward/std": 3.581414222717285, + "rewards/kidney_reward/mean": 1.6331734657287598, + "rewards/kidney_reward/std": 1.749567985534668, + "rewards/length2tails_reward/mean": 0.5846536159515381, + "rewards/length2tails_reward/std": 0.39755120873451233, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.528343915939331, + "rewards/thermo_reward/std": 2.701770305633545, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.875, + "completions/mean_terminated_length": 269.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10855805594474077, + "epoch": 0.436, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.061504144221544266, + "learning_rate": 1.9771592538021285e-06, + "loss": 0.0001, + "num_tokens": 1888364.0, + "reward": 11.166021347045898, + "reward_std": 5.292967319488525, + "rewards/fitness_reward/mean": 6.575160503387451, + "rewards/fitness_reward/std": 2.870858669281006, + "rewards/kidney_reward/mean": 2.0742831230163574, + "rewards/kidney_reward/std": 1.3122246265411377, + "rewards/length2tails_reward/mean": 0.5768504738807678, + "rewards/length2tails_reward/std": 0.3807322382926941, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.3588924407958984, + "rewards/thermo_reward/std": 2.110995292663574, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09409010969102383, + "epoch": 0.438, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09661436080932617, + "learning_rate": 1.9768859547646473e-06, + "loss": 0.0003, + "num_tokens": 1897067.0, + "reward": 7.066924095153809, + "reward_std": 9.021658897399902, + "rewards/fitness_reward/mean": 5.256416320800781, + "rewards/fitness_reward/std": 4.2729172706604, + "rewards/kidney_reward/mean": 1.1841405630111694, + "rewards/kidney_reward/std": 2.26008939743042, + "rewards/length2tails_reward/mean": 0.5950348973274231, + "rewards/length2tails_reward/std": 0.4093562662601471, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.466863214969635, + "rewards/thermo_reward/std": 3.3025662899017334, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10664527304470539, + "epoch": 0.44, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11443617194890976, + "learning_rate": 1.9766110494836685e-06, + "loss": -0.0004, + "num_tokens": 1905731.0, + "reward": 7.3894758224487305, + "reward_std": 8.807182312011719, + "rewards/fitness_reward/mean": 5.3923797607421875, + "rewards/fitness_reward/std": 4.229556560516357, + "rewards/kidney_reward/mean": 1.0697729587554932, + "rewards/kidney_reward/std": 2.2666690349578857, + "rewards/length2tails_reward/mean": 0.5529941916465759, + "rewards/length2tails_reward/std": 0.3819185793399811, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.7720230221748352, + "rewards/thermo_reward/std": 3.1068079471588135, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11153780948370695, + "epoch": 0.442, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1167251318693161, + "learning_rate": 1.976334538411204e-06, + "loss": 0.0033, + "num_tokens": 1914419.0, + "reward": 7.049777030944824, + "reward_std": 9.890249252319336, + "rewards/fitness_reward/mean": 4.516417980194092, + "rewards/fitness_reward/std": 5.013523101806641, + "rewards/kidney_reward/mean": 1.3213751316070557, + "rewards/kidney_reward/std": 2.31786847114563, + "rewards/length2tails_reward/mean": 0.6002007126808167, + "rewards/length2tails_reward/std": 0.410892128944397, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.0519638061523438, + "rewards/thermo_reward/std": 3.2038497924804688, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09985514171421528, + "epoch": 0.444, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06074235960841179, + "learning_rate": 1.9760564220019072e-06, + "loss": -0.0024, + "num_tokens": 1923107.0, + "reward": 8.840917587280273, + "reward_std": 8.136581420898438, + "rewards/fitness_reward/mean": 5.285778999328613, + "rewards/fitness_reward/std": 4.406919956207275, + "rewards/kidney_reward/mean": 1.6758543252944946, + "rewards/kidney_reward/std": 1.9091429710388184, + "rewards/length2tails_reward/mean": 0.5887254476547241, + "rewards/length2tails_reward/std": 0.41157737374305725, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 1.7266616821289062, + "rewards/thermo_reward/std": 2.6433470249176025, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.09375, + "completions/mean_terminated_length": 270.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10312576498836279, + "epoch": 0.446, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.060146745294332504, + "learning_rate": 1.9757767007130704e-06, + "loss": 0.0018, + "num_tokens": 1931782.0, + "reward": 9.458915710449219, + "reward_std": 7.045760631561279, + "rewards/fitness_reward/mean": 6.186161518096924, + "rewards/fitness_reward/std": 3.3415544033050537, + "rewards/kidney_reward/mean": 1.8518340587615967, + "rewards/kidney_reward/std": 1.664615273475647, + "rewards/length2tails_reward/mean": 0.5694063901901245, + "rewards/length2tails_reward/std": 0.3994041979312897, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.2639801502227783, + "rewards/thermo_reward/std": 2.8834218978881836, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 267.625, + "completions/mean_terminated_length": 267.625, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "entropy": 0.1007047207094729, + "epoch": 0.448, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12512126564979553, + "learning_rate": 1.9754953750046246e-06, + "loss": -0.0234, + "num_tokens": 1940378.0, + "reward": 8.606588363647461, + "reward_std": 8.08462905883789, + "rewards/fitness_reward/mean": 5.883886337280273, + "rewards/fitness_reward/std": 3.8256380558013916, + "rewards/kidney_reward/mean": 1.5120166540145874, + "rewards/kidney_reward/std": 2.1992740631103516, + "rewards/length2tails_reward/mean": 0.5627999901771545, + "rewards/length2tails_reward/std": 0.3526705205440521, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.0544053316116333, + "rewards/thermo_reward/std": 2.9035634994506836, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.46875, + "completions/mean_terminated_length": 269.46875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09678153228014708, + "epoch": 0.45, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18851271271705627, + "learning_rate": 1.9752124453391404e-06, + "loss": -0.0008, + "num_tokens": 1949033.0, + "reward": 6.986786842346191, + "reward_std": 8.987977027893066, + "rewards/fitness_reward/mean": 5.2845048904418945, + "rewards/fitness_reward/std": 4.396836280822754, + "rewards/kidney_reward/mean": 1.0947299003601074, + "rewards/kidney_reward/std": 2.2589919567108154, + "rewards/length2tails_reward/mean": 0.5387097597122192, + "rewards/length2tails_reward/std": 0.40030577778816223, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.45368072390556335, + "rewards/thermo_reward/std": 2.9777472019195557, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11522582359611988, + "epoch": 0.452, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0713803768157959, + "learning_rate": 1.9749279121818236e-06, + "loss": 0.0016, + "num_tokens": 1957726.0, + "reward": 11.161405563354492, + "reward_std": 7.142817974090576, + "rewards/fitness_reward/mean": 6.314478397369385, + "rewards/fitness_reward/std": 3.3092589378356934, + "rewards/kidney_reward/mean": 2.069758415222168, + "rewards/kidney_reward/std": 1.672605276107788, + "rewards/length2tails_reward/mean": 0.6350034475326538, + "rewards/length2tails_reward/std": 0.3509335219860077, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6136674880981445, + "rewards/thermo_reward/std": 2.301509141921997, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10247354488819838, + "epoch": 0.454, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08207889646291733, + "learning_rate": 1.9746417760005176e-06, + "loss": -0.0077, + "num_tokens": 1966410.0, + "reward": 11.169610023498535, + "reward_std": 4.311980724334717, + "rewards/fitness_reward/mean": 6.826416969299316, + "rewards/fitness_reward/std": 2.1413729190826416, + "rewards/kidney_reward/mean": 2.0623292922973633, + "rewards/kidney_reward/std": 1.2271078824996948, + "rewards/length2tails_reward/mean": 0.5567665100097656, + "rewards/length2tails_reward/std": 0.38493385910987854, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.125187873840332, + "rewards/thermo_reward/std": 1.8756372928619385, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.46875, + "completions/mean_terminated_length": 269.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10469500161707401, + "epoch": 0.456, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08091660588979721, + "learning_rate": 1.9743540372657017e-06, + "loss": -0.0067, + "num_tokens": 1975065.0, + "reward": 9.968902587890625, + "reward_std": 6.146872520446777, + "rewards/fitness_reward/mean": 5.6489458084106445, + "rewards/fitness_reward/std": 4.059178829193115, + "rewards/kidney_reward/mean": 2.179636001586914, + "rewards/kidney_reward/std": 1.0131422281265259, + "rewards/length2tails_reward/mean": 0.5025430917739868, + "rewards/length2tails_reward/std": 0.36392441391944885, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 1.9963165521621704, + "rewards/thermo_reward/std": 2.2949652671813965, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1013132524676621, + "epoch": 0.458, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2698652446269989, + "learning_rate": 1.97406469645049e-06, + "loss": 0.0001, + "num_tokens": 1983802.0, + "reward": 9.640462875366211, + "reward_std": 7.675972938537598, + "rewards/fitness_reward/mean": 5.40672492980957, + "rewards/fitness_reward/std": 4.173493385314941, + "rewards/kidney_reward/mean": 1.7126578092575073, + "rewards/kidney_reward/std": 1.7750372886657715, + "rewards/length2tails_reward/mean": 0.6748954057693481, + "rewards/length2tails_reward/std": 0.35597914457321167, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.353590965270996, + "rewards/thermo_reward/std": 2.550793409347534, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.28125, + "completions/mean_terminated_length": 270.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1011765506118536, + "epoch": 0.46, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1072259470820427, + "learning_rate": 1.9737737540306303e-06, + "loss": -0.007, + "num_tokens": 1992483.0, + "reward": 9.985023498535156, + "reward_std": 6.017777919769287, + "rewards/fitness_reward/mean": 6.292191982269287, + "rewards/fitness_reward/std": 3.01308536529541, + "rewards/kidney_reward/mean": 1.845158576965332, + "rewards/kidney_reward/std": 1.4613128900527954, + "rewards/length2tails_reward/mean": 0.5501976013183594, + "rewards/length2tails_reward/std": 0.4069270193576813, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.692654013633728, + "rewards/thermo_reward/std": 2.388899087905884, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09331366792321205, + "epoch": 0.462, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06806115806102753, + "learning_rate": 1.9734812104845046e-06, + "loss": 0.0001, + "num_tokens": 2001191.0, + "reward": 8.523029327392578, + "reward_std": 9.589752197265625, + "rewards/fitness_reward/mean": 4.993949890136719, + "rewards/fitness_reward/std": 4.623663425445557, + "rewards/kidney_reward/mean": 1.4517195224761963, + "rewards/kidney_reward/std": 2.2691798210144043, + "rewards/length2tails_reward/mean": 0.6628376245498657, + "rewards/length2tails_reward/std": 0.3733203411102295, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9110764265060425, + "rewards/thermo_reward/std": 2.9277281761169434, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10501847136765718, + "epoch": 0.464, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06102055683732033, + "learning_rate": 1.9731870662931275e-06, + "loss": -0.0046, + "num_tokens": 2009875.0, + "reward": 10.10276985168457, + "reward_std": 5.667848587036133, + "rewards/fitness_reward/mean": 6.6926422119140625, + "rewards/fitness_reward/std": 2.6392531394958496, + "rewards/kidney_reward/mean": 1.8372772932052612, + "rewards/kidney_reward/std": 1.5682625770568848, + "rewards/length2tails_reward/mean": 0.5817334651947021, + "rewards/length2tails_reward/std": 0.3835720121860504, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.4146769046783447, + "rewards/thermo_reward/std": 2.468012571334839, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10510055907070637, + "epoch": 0.466, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09905203431844711, + "learning_rate": 1.9728913219401447e-06, + "loss": -0.0012, + "num_tokens": 2018558.0, + "reward": 8.83747673034668, + "reward_std": 7.369777202606201, + "rewards/fitness_reward/mean": 5.248540878295898, + "rewards/fitness_reward/std": 4.345114707946777, + "rewards/kidney_reward/mean": 1.7455861568450928, + "rewards/kidney_reward/std": 1.5706603527069092, + "rewards/length2tails_reward/mean": 0.5550670623779297, + "rewards/length2tails_reward/std": 0.39944586157798767, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.6878437995910645, + "rewards/thermo_reward/std": 2.5179495811462402, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08783813379704952, + "epoch": 0.468, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08770688623189926, + "learning_rate": 1.972593977911834e-06, + "loss": 0.0015, + "num_tokens": 2027222.0, + "reward": 5.320065021514893, + "reward_std": 9.834723472595215, + "rewards/fitness_reward/mean": 3.948270082473755, + "rewards/fitness_reward/std": 5.157715320587158, + "rewards/kidney_reward/mean": 0.9420422911643982, + "rewards/kidney_reward/std": 2.3843331336975098, + "rewards/length2tails_reward/mean": 0.5566017627716064, + "rewards/length2tails_reward/std": 0.38987359404563904, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.274092435836792, + "rewards/thermo_reward/std": 3.1555631160736084, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.65625, + "completions/mean_terminated_length": 271.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1020322423428297, + "epoch": 0.47, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04721997305750847, + "learning_rate": 1.9722950346971028e-06, + "loss": -0.0039, + "num_tokens": 2035947.0, + "reward": 10.875839233398438, + "reward_std": 6.017666816711426, + "rewards/fitness_reward/mean": 6.333611488342285, + "rewards/fitness_reward/std": 3.2550642490386963, + "rewards/kidney_reward/mean": 2.1162962913513184, + "rewards/kidney_reward/std": 1.250596046447754, + "rewards/length2tails_reward/mean": 0.6817765235900879, + "rewards/length2tails_reward/std": 0.36646783351898193, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.25775408744812, + "rewards/thermo_reward/std": 2.3996896743774414, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.625, + "completions/mean_terminated_length": 270.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11050799861550331, + "epoch": 0.472, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07714132219552994, + "learning_rate": 1.971994492787488e-06, + "loss": -0.0022, + "num_tokens": 2044639.0, + "reward": 9.818836212158203, + "reward_std": 6.177550792694092, + "rewards/fitness_reward/mean": 6.414205074310303, + "rewards/fitness_reward/std": 2.6555421352386475, + "rewards/kidney_reward/mean": 1.8081889152526855, + "rewards/kidney_reward/std": 1.7323895692825317, + "rewards/length2tails_reward/mean": 0.5811498165130615, + "rewards/length2tails_reward/std": 0.3792761266231537, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.438327431678772, + "rewards/thermo_reward/std": 2.689751148223877, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 264.25, + "completions/mean_terminated_length": 264.25, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.12198043707758188, + "epoch": 0.474, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.40211084485054016, + "learning_rate": 1.971692352677155e-06, + "loss": -0.1007, + "num_tokens": 2053127.0, + "reward": 10.964704513549805, + "reward_std": 6.078287124633789, + "rewards/fitness_reward/mean": 6.610565662384033, + "rewards/fitness_reward/std": 2.9550764560699463, + "rewards/kidney_reward/mean": 2.0635910034179688, + "rewards/kidney_reward/std": 1.5378971099853516, + "rewards/length2tails_reward/mean": 0.6503303050994873, + "rewards/length2tails_reward/std": 0.3064909875392914, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.1255152225494385, + "rewards/thermo_reward/std": 2.5424704551696777, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1105615459382534, + "epoch": 0.476, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08003703504800797, + "learning_rate": 1.9713886148628977e-06, + "loss": -0.0059, + "num_tokens": 2061874.0, + "reward": 9.830589294433594, + "reward_std": 7.20787239074707, + "rewards/fitness_reward/mean": 5.873849868774414, + "rewards/fitness_reward/std": 3.709421157836914, + "rewards/kidney_reward/mean": 1.8575446605682373, + "rewards/kidney_reward/std": 1.6075026988983154, + "rewards/length2tails_reward/mean": 0.7331730127334595, + "rewards/length2tails_reward/std": 0.33084240555763245, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9258770942687988, + "rewards/thermo_reward/std": 2.7858848571777344, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.90625, + "completions/mean_terminated_length": 269.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.07857981137931347, + "epoch": 0.478, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1605258733034134, + "learning_rate": 1.971083279844136e-06, + "loss": -0.0034, + "num_tokens": 2070543.0, + "reward": 5.80064582824707, + "reward_std": 9.10144329071045, + "rewards/fitness_reward/mean": 4.05497932434082, + "rewards/fitness_reward/std": 5.024665355682373, + "rewards/kidney_reward/mean": 0.9805687069892883, + "rewards/kidney_reward/std": 2.246600866317749, + "rewards/length2tails_reward/mean": 0.5228518843650818, + "rewards/length2tails_reward/std": 0.40586429834365845, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.6128125190734863, + "rewards/thermo_reward/std": 3.0637567043304443, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1058599092066288, + "epoch": 0.48, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07374675571918488, + "learning_rate": 1.970776348122918e-06, + "loss": -0.0001, + "num_tokens": 2079253.0, + "reward": 8.957300186157227, + "reward_std": 8.268644332885742, + "rewards/fitness_reward/mean": 5.5188822746276855, + "rewards/fitness_reward/std": 4.086907386779785, + "rewards/kidney_reward/mean": 1.638132929801941, + "rewards/kidney_reward/std": 1.9194594621658325, + "rewards/length2tails_reward/mean": 0.686102569103241, + "rewards/length2tails_reward/std": 0.33712905645370483, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.631675124168396, + "rewards/thermo_reward/std": 2.6473958492279053, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 268.9375, + "completions/mean_terminated_length": 268.9375, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.09808468306437135, + "epoch": 0.482, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10668472200632095, + "learning_rate": 1.9704678202039145e-06, + "loss": -0.0018, + "num_tokens": 2087891.0, + "reward": 8.597445487976074, + "reward_std": 8.43989372253418, + "rewards/fitness_reward/mean": 5.264682769775391, + "rewards/fitness_reward/std": 4.307192802429199, + "rewards/kidney_reward/mean": 1.4983744621276855, + "rewards/kidney_reward/std": 1.901678442955017, + "rewards/length2tails_reward/mean": 0.612869381904602, + "rewards/length2tails_reward/std": 0.3913209140300751, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.6731013059616089, + "rewards/thermo_reward/std": 2.823862314224243, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.09375, + "completions/mean_terminated_length": 269.09375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09199962671846151, + "epoch": 0.484, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11633557081222534, + "learning_rate": 1.970157696594423e-06, + "loss": -0.0002, + "num_tokens": 2096534.0, + "reward": 7.774277687072754, + "reward_std": 8.003270149230957, + "rewards/fitness_reward/mean": 5.301121711730957, + "rewards/fitness_reward/std": 4.235241413116455, + "rewards/kidney_reward/mean": 1.5189933776855469, + "rewards/kidney_reward/std": 1.8085318803787231, + "rewards/length2tails_reward/mean": 0.4761810600757599, + "rewards/length2tails_reward/std": 0.3997570872306824, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 0.8127946257591248, + "rewards/thermo_reward/std": 2.9532229900360107, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10292203165590763, + "epoch": 0.486, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16641299426555634, + "learning_rate": 1.9698459778043624e-06, + "loss": 0.0031, + "num_tokens": 2105230.0, + "reward": 7.040865898132324, + "reward_std": 9.38436508178711, + "rewards/fitness_reward/mean": 5.185315132141113, + "rewards/fitness_reward/std": 4.608910083770752, + "rewards/kidney_reward/mean": 1.0840166807174683, + "rewards/kidney_reward/std": 2.2991762161254883, + "rewards/length2tails_reward/mean": 0.634776771068573, + "rewards/length2tails_reward/std": 0.3302549719810486, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.6080567836761475, + "rewards/thermo_reward/std": 3.2210071086883545, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11193057429045439, + "epoch": 0.488, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09709324687719345, + "learning_rate": 1.9695326643462775e-06, + "loss": -0.0052, + "num_tokens": 2113917.0, + "reward": 10.90302848815918, + "reward_std": 4.896656036376953, + "rewards/fitness_reward/mean": 6.579126358032227, + "rewards/fitness_reward/std": 2.6416797637939453, + "rewards/kidney_reward/mean": 2.070417881011963, + "rewards/kidney_reward/std": 1.136030912399292, + "rewards/length2tails_reward/mean": 0.5819152593612671, + "rewards/length2tails_reward/std": 0.390530526638031, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.0952932834625244, + "rewards/thermo_reward/std": 2.45220685005188, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.03125, + "completions/mean_terminated_length": 271.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10169417830184102, + "epoch": 0.49, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08554291725158691, + "learning_rate": 1.9692177567353328e-06, + "loss": -0.0042, + "num_tokens": 2122622.0, + "reward": 9.377301216125488, + "reward_std": 7.241844654083252, + "rewards/fitness_reward/mean": 6.431892395019531, + "rewards/fitness_reward/std": 3.0229907035827637, + "rewards/kidney_reward/mean": 1.6262295246124268, + "rewards/kidney_reward/std": 2.0052170753479004, + "rewards/length2tails_reward/mean": 0.6268943548202515, + "rewards/length2tails_reward/std": 0.38420239090919495, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.156490683555603, + "rewards/thermo_reward/std": 3.046543836593628, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.3125, + "completions/mean_terminated_length": 270.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11073481850326061, + "epoch": 0.492, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18680819869041443, + "learning_rate": 1.968901255489315e-06, + "loss": -0.0029, + "num_tokens": 2131304.0, + "reward": 10.530344009399414, + "reward_std": 6.960169315338135, + "rewards/fitness_reward/mean": 5.975531578063965, + "rewards/fitness_reward/std": 3.5728461742401123, + "rewards/kidney_reward/mean": 1.9520188570022583, + "rewards/kidney_reward/std": 1.632800579071045, + "rewards/length2tails_reward/mean": 0.6160197257995605, + "rewards/length2tails_reward/std": 0.35398852825164795, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4411919116973877, + "rewards/thermo_reward/std": 2.3167166709899902, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11004836764186621, + "epoch": 0.494, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11552101373672485, + "learning_rate": 1.968583161128631e-06, + "loss": 0.0027, + "num_tokens": 2139992.0, + "reward": 11.550786018371582, + "reward_std": 4.9059977531433105, + "rewards/fitness_reward/mean": 6.833098411560059, + "rewards/fitness_reward/std": 2.1057968139648438, + "rewards/kidney_reward/mean": 2.2125933170318604, + "rewards/kidney_reward/std": 1.2264693975448608, + "rewards/length2tails_reward/mean": 0.6199765205383301, + "rewards/length2tails_reward/std": 0.37857866287231445, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.343096971511841, + "rewards/thermo_reward/std": 2.2181098461151123, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10392105765640736, + "epoch": 0.496, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.068658247590065, + "learning_rate": 1.9682634741763067e-06, + "loss": -0.0049, + "num_tokens": 2148708.0, + "reward": 10.745346069335938, + "reward_std": 5.804399490356445, + "rewards/fitness_reward/mean": 6.5791473388671875, + "rewards/fitness_reward/std": 2.680927038192749, + "rewards/kidney_reward/mean": 1.9059796333312988, + "rewards/kidney_reward/std": 1.4528141021728516, + "rewards/length2tails_reward/mean": 0.6526817679405212, + "rewards/length2tails_reward/std": 0.37751561403274536, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.0949506759643555, + "rewards/thermo_reward/std": 2.1236202716827393, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11237195134162903, + "epoch": 0.498, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0876859650015831, + "learning_rate": 1.967942195157987e-06, + "loss": 0.0018, + "num_tokens": 2157396.0, + "reward": 9.161323547363281, + "reward_std": 7.447658061981201, + "rewards/fitness_reward/mean": 5.935283660888672, + "rewards/fitness_reward/std": 3.6782286167144775, + "rewards/kidney_reward/mean": 1.6700785160064697, + "rewards/kidney_reward/std": 1.729300856590271, + "rewards/length2tails_reward/mean": 0.5815059542655945, + "rewards/length2tails_reward/std": 0.3755898177623749, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.3978110551834106, + "rewards/thermo_reward/std": 2.6698153018951416, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10672505851835012, + "epoch": 0.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07486039400100708, + "learning_rate": 1.967619324601935e-06, + "loss": 0.0024, + "num_tokens": 2166111.0, + "reward": 9.086592674255371, + "reward_std": 8.031876564025879, + "rewards/fitness_reward/mean": 5.875249862670898, + "rewards/fitness_reward/std": 3.831639289855957, + "rewards/kidney_reward/mean": 1.4601225852966309, + "rewards/kidney_reward/std": 2.075751781463623, + "rewards/length2tails_reward/mean": 0.6911107897758484, + "rewards/length2tails_reward/std": 0.34283214807510376, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.582108974456787, + "rewards/thermo_reward/std": 2.710895299911499, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10726206749677658, + "epoch": 0.502, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1551721841096878, + "learning_rate": 1.9672948630390295e-06, + "loss": -0.0015, + "num_tokens": 2174822.0, + "reward": 9.408416748046875, + "reward_std": 8.032960891723633, + "rewards/fitness_reward/mean": 5.81867790222168, + "rewards/fitness_reward/std": 3.987089157104492, + "rewards/kidney_reward/mean": 1.5778536796569824, + "rewards/kidney_reward/std": 1.9939128160476685, + "rewards/length2tails_reward/mean": 0.636410117149353, + "rewards/length2tails_reward/std": 0.3963748514652252, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.8482444286346436, + "rewards/thermo_reward/std": 2.960555076599121, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.0918376175686717, + "epoch": 0.504, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06933335214853287, + "learning_rate": 1.9669688110027664e-06, + "loss": -0.0004, + "num_tokens": 2183493.0, + "reward": 9.228353500366211, + "reward_std": 8.074507713317871, + "rewards/fitness_reward/mean": 5.570281028747559, + "rewards/fitness_reward/std": 4.243092060089111, + "rewards/kidney_reward/mean": 1.7595927715301514, + "rewards/kidney_reward/std": 1.8972914218902588, + "rewards/length2tails_reward/mean": 0.5458166003227234, + "rewards/length2tails_reward/std": 0.379034161567688, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.7438974380493164, + "rewards/thermo_reward/std": 2.649005651473999, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10837728530168533, + "epoch": 0.506, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10028190910816193, + "learning_rate": 1.966641169029256e-06, + "loss": -0.0051, + "num_tokens": 2192208.0, + "reward": 8.974296569824219, + "reward_std": 7.428702354431152, + "rewards/fitness_reward/mean": 5.241325855255127, + "rewards/fitness_reward/std": 4.222070693969727, + "rewards/kidney_reward/mean": 1.8032042980194092, + "rewards/kidney_reward/std": 1.6074918508529663, + "rewards/length2tails_reward/mean": 0.643540620803833, + "rewards/length2tails_reward/std": 0.3968313932418823, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 1.7716628313064575, + "rewards/thermo_reward/std": 2.5006768703460693, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0990586718544364, + "epoch": 0.508, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08781884610652924, + "learning_rate": 1.966311937657224e-06, + "loss": -0.0021, + "num_tokens": 2200898.0, + "reward": 10.355709075927734, + "reward_std": 5.7248406410217285, + "rewards/fitness_reward/mean": 6.132905006408691, + "rewards/fitness_reward/std": 3.5404434204101562, + "rewards/kidney_reward/mean": 2.206814765930176, + "rewards/kidney_reward/std": 1.017005443572998, + "rewards/length2tails_reward/mean": 0.5586007833480835, + "rewards/length2tails_reward/std": 0.4141377806663513, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.8601289987564087, + "rewards/thermo_reward/std": 2.4966318607330322, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.21875, + "completions/mean_terminated_length": 270.21875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10989784728735685, + "epoch": 0.51, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3989095687866211, + "learning_rate": 1.9659811174280078e-06, + "loss": 0.002, + "num_tokens": 2209577.0, + "reward": 9.617646217346191, + "reward_std": 7.460577964782715, + "rewards/fitness_reward/mean": 5.974531650543213, + "rewards/fitness_reward/std": 3.737210273742676, + "rewards/kidney_reward/mean": 1.925750732421875, + "rewards/kidney_reward/std": 1.7319450378417969, + "rewards/length2tails_reward/mean": 0.5906080603599548, + "rewards/length2tails_reward/std": 0.3697930872440338, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.558302640914917, + "rewards/thermo_reward/std": 2.6359283924102783, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10572863556444645, + "epoch": 0.512, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06452004611492157, + "learning_rate": 1.965648708885559e-06, + "loss": -0.0064, + "num_tokens": 2218280.0, + "reward": 12.625532150268555, + "reward_std": 3.140712261199951, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.4296531677246094, + "rewards/kidney_reward/std": 0.5578335523605347, + "rewards/length2tails_reward/mean": 0.6430323123931885, + "rewards/length2tails_reward/std": 0.36871227622032166, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.978522300720215, + "rewards/thermo_reward/std": 1.303676962852478, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09694924391806126, + "epoch": 0.514, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.052169837057590485, + "learning_rate": 1.965314712576439e-06, + "loss": -0.0001, + "num_tokens": 2227002.0, + "reward": 11.298360824584961, + "reward_std": 5.958661079406738, + "rewards/fitness_reward/mean": 6.431214332580566, + "rewards/fitness_reward/std": 2.8412742614746094, + "rewards/kidney_reward/mean": 2.123246192932129, + "rewards/kidney_reward/std": 1.3759034872055054, + "rewards/length2tails_reward/mean": 0.6636830568313599, + "rewards/length2tails_reward/std": 0.38713860511779785, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.5775318145751953, + "rewards/thermo_reward/std": 1.9351935386657715, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09880248364061117, + "epoch": 0.516, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06404350697994232, + "learning_rate": 1.964979129049821e-06, + "loss": -0.0011, + "num_tokens": 2235710.0, + "reward": 8.78803825378418, + "reward_std": 7.820840358734131, + "rewards/fitness_reward/mean": 5.565892219543457, + "rewards/fitness_reward/std": 3.981142282485962, + "rewards/kidney_reward/mean": 1.5922739505767822, + "rewards/kidney_reward/std": 1.8315236568450928, + "rewards/length2tails_reward/mean": 0.6426012516021729, + "rewards/length2tails_reward/std": 0.40327534079551697, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 1.471862554550171, + "rewards/thermo_reward/std": 2.617784261703491, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11636047996580601, + "epoch": 0.518, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07931548357009888, + "learning_rate": 1.964641958857489e-06, + "loss": -0.0016, + "num_tokens": 2244421.0, + "reward": 8.441205978393555, + "reward_std": 8.229249954223633, + "rewards/fitness_reward/mean": 5.807088851928711, + "rewards/fitness_reward/std": 3.8794262409210205, + "rewards/kidney_reward/mean": 1.409515380859375, + "rewards/kidney_reward/std": 2.178295850753784, + "rewards/length2tails_reward/mean": 0.6650169491767883, + "rewards/length2tails_reward/std": 0.35845041275024414, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.0580998659133911, + "rewards/thermo_reward/std": 3.0075619220733643, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10207339003682137, + "epoch": 0.52, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1335216462612152, + "learning_rate": 1.964303202553833e-06, + "loss": 0.0047, + "num_tokens": 2253101.0, + "reward": 7.660137176513672, + "reward_std": 9.0372953414917, + "rewards/fitness_reward/mean": 5.297764778137207, + "rewards/fitness_reward/std": 4.39350700378418, + "rewards/kidney_reward/mean": 1.3012959957122803, + "rewards/kidney_reward/std": 2.284921169281006, + "rewards/length2tails_reward/mean": 0.5699120759963989, + "rewards/length2tails_reward/std": 0.3861338198184967, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.9040855169296265, + "rewards/thermo_reward/std": 3.124284029006958, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0989492554217577, + "epoch": 0.522, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08350550383329391, + "learning_rate": 1.9639628606958534e-06, + "loss": -0.0036, + "num_tokens": 2261796.0, + "reward": 9.212039947509766, + "reward_std": 7.971827030181885, + "rewards/fitness_reward/mean": 5.565590858459473, + "rewards/fitness_reward/std": 4.099094390869141, + "rewards/kidney_reward/mean": 1.6808946132659912, + "rewards/kidney_reward/std": 1.801498293876648, + "rewards/length2tails_reward/mean": 0.6365346312522888, + "rewards/length2tails_reward/std": 0.3803783357143402, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.8019015789031982, + "rewards/thermo_reward/std": 2.71297025680542, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 267.46875, + "completions/mean_terminated_length": 267.46875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.10223742201924324, + "epoch": 0.524, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2912435829639435, + "learning_rate": 1.9636209338431567e-06, + "loss": -0.0394, + "num_tokens": 2270387.0, + "reward": 10.56210994720459, + "reward_std": 7.012156009674072, + "rewards/fitness_reward/mean": 6.2607808113098145, + "rewards/fitness_reward/std": 3.311769485473633, + "rewards/kidney_reward/mean": 1.8473119735717773, + "rewards/kidney_reward/std": 1.8399372100830078, + "rewards/length2tails_reward/mean": 0.5963507890701294, + "rewards/length2tails_reward/std": 0.3624004125595093, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.294381618499756, + "rewards/thermo_reward/std": 2.297987461090088, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.875, + "completions/mean_terminated_length": 270.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1105208182707429, + "epoch": 0.526, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07265937328338623, + "learning_rate": 1.963277422557956e-06, + "loss": 0.003, + "num_tokens": 2279087.0, + "reward": 10.239557266235352, + "reward_std": 7.107791900634766, + "rewards/fitness_reward/mean": 6.289301872253418, + "rewards/fitness_reward/std": 3.386221170425415, + "rewards/kidney_reward/mean": 1.9778251647949219, + "rewards/kidney_reward/std": 1.6936469078063965, + "rewards/length2tails_reward/mean": 0.6477591395378113, + "rewards/length2tails_reward/std": 0.37626996636390686, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.8076553344726562, + "rewards/thermo_reward/std": 2.515333652496338, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10514908283948898, + "epoch": 0.528, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05402668938040733, + "learning_rate": 1.962932327405069e-06, + "loss": 0.0008, + "num_tokens": 2287783.0, + "reward": 10.216456413269043, + "reward_std": 7.160484790802002, + "rewards/fitness_reward/mean": 6.291948318481445, + "rewards/fitness_reward/std": 3.37766695022583, + "rewards/kidney_reward/mean": 1.8252959251403809, + "rewards/kidney_reward/std": 1.7469253540039062, + "rewards/length2tails_reward/mean": 0.6056156158447266, + "rewards/length2tails_reward/std": 0.4104541540145874, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.938650131225586, + "rewards/thermo_reward/std": 2.5619373321533203, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.5625, + "completions/mean_terminated_length": 269.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09763561934232712, + "epoch": 0.53, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06480225920677185, + "learning_rate": 1.962585648951918e-06, + "loss": 0.0019, + "num_tokens": 2296441.0, + "reward": 10.411375045776367, + "reward_std": 6.475214004516602, + "rewards/fitness_reward/mean": 6.272033214569092, + "rewards/fitness_reward/std": 3.441664695739746, + "rewards/kidney_reward/mean": 2.0811996459960938, + "rewards/kidney_reward/std": 1.392430067062378, + "rewards/length2tails_reward/mean": 0.5402275919914246, + "rewards/length2tails_reward/std": 0.3795466125011444, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.904119849205017, + "rewards/thermo_reward/std": 2.7143473625183105, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1107251700013876, + "epoch": 0.532, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08268602937459946, + "learning_rate": 1.962237387768529e-06, + "loss": -0.0002, + "num_tokens": 2305181.0, + "reward": 10.285980224609375, + "reward_std": 7.013757228851318, + "rewards/fitness_reward/mean": 6.299373626708984, + "rewards/fitness_reward/std": 3.354177713394165, + "rewards/kidney_reward/mean": 2.0277373790740967, + "rewards/kidney_reward/std": 1.6260619163513184, + "rewards/length2tails_reward/mean": 0.7632877230644226, + "rewards/length2tails_reward/std": 0.2882511615753174, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.7825400829315186, + "rewards/thermo_reward/std": 2.7083804607391357, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10624216496944427, + "epoch": 0.534, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07529258728027344, + "learning_rate": 1.9618875444275292e-06, + "loss": -0.0061, + "num_tokens": 2313864.0, + "reward": 10.298490524291992, + "reward_std": 5.719725131988525, + "rewards/fitness_reward/mean": 5.982329368591309, + "rewards/fitness_reward/std": 3.718473196029663, + "rewards/kidney_reward/mean": 2.1536967754364014, + "rewards/kidney_reward/std": 1.1052112579345703, + "rewards/length2tails_reward/mean": 0.57701575756073, + "rewards/length2tails_reward/std": 0.3976078927516937, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.0047621726989746, + "rewards/thermo_reward/std": 2.1830263137817383, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.09375, + "completions/mean_terminated_length": 271.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1106356205418706, + "epoch": 0.536, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07455572485923767, + "learning_rate": 1.961536119504149e-06, + "loss": -0.0065, + "num_tokens": 2322571.0, + "reward": 11.61314868927002, + "reward_std": 3.800121784210205, + "rewards/fitness_reward/mean": 6.880525588989258, + "rewards/fitness_reward/std": 1.8558024168014526, + "rewards/kidney_reward/mean": 2.280116558074951, + "rewards/kidney_reward/std": 0.8632156848907471, + "rewards/length2tails_reward/mean": 0.6226137280464172, + "rewards/length2tails_reward/std": 0.3921652138233185, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.290245294570923, + "rewards/thermo_reward/std": 1.98012113571167, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11541940923780203, + "epoch": 0.538, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0967407152056694, + "learning_rate": 1.9611831135762175e-06, + "loss": -0.0032, + "num_tokens": 2331301.0, + "reward": 8.631363868713379, + "reward_std": 8.920723915100098, + "rewards/fitness_reward/mean": 5.311345100402832, + "rewards/fitness_reward/std": 4.344130516052246, + "rewards/kidney_reward/mean": 1.4954752922058105, + "rewards/kidney_reward/std": 2.1614737510681152, + "rewards/length2tails_reward/mean": 0.7488479614257812, + "rewards/length2tails_reward/std": 0.29961255192756653, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.6496593952178955, + "rewards/thermo_reward/std": 2.8214712142944336, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.40625, + "completions/mean_terminated_length": 270.40625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1007602158933878, + "epoch": 0.54, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06450812518596649, + "learning_rate": 1.960828527224165e-06, + "loss": 0.0034, + "num_tokens": 2339986.0, + "reward": 10.680625915527344, + "reward_std": 7.085577011108398, + "rewards/fitness_reward/mean": 6.308061122894287, + "rewards/fitness_reward/std": 3.326788902282715, + "rewards/kidney_reward/mean": 1.973847508430481, + "rewards/kidney_reward/std": 1.7045009136199951, + "rewards/length2tails_reward/mean": 0.6022521257400513, + "rewards/length2tails_reward/std": 0.36868545413017273, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.238492488861084, + "rewards/thermo_reward/std": 2.4997081756591797, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10899223946034908, + "epoch": 0.542, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07492761313915253, + "learning_rate": 1.9604723610310193e-06, + "loss": -0.0014, + "num_tokens": 2348672.0, + "reward": 9.707115173339844, + "reward_std": 6.792953014373779, + "rewards/fitness_reward/mean": 6.241734504699707, + "rewards/fitness_reward/std": 3.173224449157715, + "rewards/kidney_reward/mean": 1.650956630706787, + "rewards/kidney_reward/std": 1.7409296035766602, + "rewards/length2tails_reward/mean": 0.6261404752731323, + "rewards/length2tails_reward/std": 0.36519333720207214, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.6518101692199707, + "rewards/thermo_reward/std": 2.6973798274993896, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10372630972415209, + "epoch": 0.544, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04830276221036911, + "learning_rate": 1.960114615582406e-06, + "loss": -0.0031, + "num_tokens": 2357376.0, + "reward": 10.46242904663086, + "reward_std": 6.330596446990967, + "rewards/fitness_reward/mean": 6.598143100738525, + "rewards/fitness_reward/std": 2.7803759574890137, + "rewards/kidney_reward/mean": 1.8887929916381836, + "rewards/kidney_reward/std": 1.5851860046386719, + "rewards/length2tails_reward/mean": 0.6319226026535034, + "rewards/length2tails_reward/std": 0.39739856123924255, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.812300205230713, + "rewards/thermo_reward/std": 2.750009298324585, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10615959856659174, + "epoch": 0.546, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12400078773498535, + "learning_rate": 1.959755291466548e-06, + "loss": -0.0009, + "num_tokens": 2366094.0, + "reward": 9.208582878112793, + "reward_std": 7.6028733253479, + "rewards/fitness_reward/mean": 6.126691818237305, + "rewards/fitness_reward/std": 3.529179096221924, + "rewards/kidney_reward/mean": 1.646256446838379, + "rewards/kidney_reward/std": 1.9871598482131958, + "rewards/length2tails_reward/mean": 0.6565751433372498, + "rewards/length2tails_reward/std": 0.37000566720962524, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.2699763774871826, + "rewards/thermo_reward/std": 2.9453539848327637, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 264.5625, + "completions/mean_terminated_length": 264.5625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.09374482091516256, + "epoch": 0.548, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28227949142456055, + "learning_rate": 1.959394389274264e-06, + "loss": -0.0441, + "num_tokens": 2374592.0, + "reward": 6.746344566345215, + "reward_std": 10.040284156799316, + "rewards/fitness_reward/mean": 4.243337631225586, + "rewards/fitness_reward/std": 5.081197261810303, + "rewards/kidney_reward/mean": 1.1457996368408203, + "rewards/kidney_reward/std": 2.2898268699645996, + "rewards/length2tails_reward/mean": 0.6021537184715271, + "rewards/length2tails_reward/std": 0.4084654748439789, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 1.2032418251037598, + "rewards/thermo_reward/std": 2.993588924407959, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 270.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08829090185463428, + "epoch": 0.55, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07252117991447449, + "learning_rate": 1.9590319095989656e-06, + "loss": -0.0062, + "num_tokens": 2383266.0, + "reward": 7.876636505126953, + "reward_std": 8.928849220275879, + "rewards/fitness_reward/mean": 4.758971691131592, + "rewards/fitness_reward/std": 4.770308494567871, + "rewards/kidney_reward/mean": 1.543241262435913, + "rewards/kidney_reward/std": 1.9612581729888916, + "rewards/length2tails_reward/mean": 0.5653245449066162, + "rewards/length2tails_reward/std": 0.38337770104408264, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.4178907871246338, + "rewards/thermo_reward/std": 2.895482063293457, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 265.03125, + "completions/mean_terminated_length": 265.03125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.11645239777863026, + "epoch": 0.552, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.374436616897583, + "learning_rate": 1.9586678530366606e-06, + "loss": -0.0846, + "num_tokens": 2391779.0, + "reward": 9.554816246032715, + "reward_std": 6.905723571777344, + "rewards/fitness_reward/mean": 6.215795516967773, + "rewards/fitness_reward/std": 3.2936534881591797, + "rewards/kidney_reward/mean": 1.817582130432129, + "rewards/kidney_reward/std": 1.7202730178833008, + "rewards/length2tails_reward/mean": 0.6768839359283447, + "rewards/length2tails_reward/std": 0.3586890399456024, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.3537501096725464, + "rewards/thermo_reward/std": 2.714689254760742, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11658206302672625, + "epoch": 0.554, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15221410989761353, + "learning_rate": 1.9583022201859483e-06, + "loss": -0.0003, + "num_tokens": 2400505.0, + "reward": 9.753368377685547, + "reward_std": 6.594207286834717, + "rewards/fitness_reward/mean": 6.377725124359131, + "rewards/fitness_reward/std": 3.114637613296509, + "rewards/kidney_reward/mean": 1.8250882625579834, + "rewards/kidney_reward/std": 1.6681197881698608, + "rewards/length2tails_reward/mean": 0.6662698984146118, + "rewards/length2tails_reward/std": 0.3910340368747711, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.383927583694458, + "rewards/thermo_reward/std": 2.8232386112213135, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.875, + "completions/mean_terminated_length": 269.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10668938141316175, + "epoch": 0.556, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05785488709807396, + "learning_rate": 1.9579350116480196e-06, + "loss": 0.0003, + "num_tokens": 2409173.0, + "reward": 10.6722993850708, + "reward_std": 6.572075366973877, + "rewards/fitness_reward/mean": 6.304146766662598, + "rewards/fitness_reward/std": 3.339707612991333, + "rewards/kidney_reward/mean": 2.1426329612731934, + "rewards/kidney_reward/std": 1.4806450605392456, + "rewards/length2tails_reward/mean": 0.6038267612457275, + "rewards/length2tails_reward/std": 0.3453625440597534, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.065136432647705, + "rewards/thermo_reward/std": 2.26564884185791, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10630720760673285, + "epoch": 0.558, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.449964702129364, + "learning_rate": 1.9575662280266574e-06, + "loss": 0.0062, + "num_tokens": 2417875.0, + "reward": 11.194704055786133, + "reward_std": 5.337928771972656, + "rewards/fitness_reward/mean": 6.6108551025390625, + "rewards/fitness_reward/std": 2.733341932296753, + "rewards/kidney_reward/mean": 2.1053543090820312, + "rewards/kidney_reward/std": 1.3551630973815918, + "rewards/length2tails_reward/mean": 0.6207993626594543, + "rewards/length2tails_reward/std": 0.3310246765613556, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.3164145946502686, + "rewards/thermo_reward/std": 2.2865805625915527, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11214358266443014, + "epoch": 0.56, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06810147315263748, + "learning_rate": 1.957195869928234e-06, + "loss": -0.0043, + "num_tokens": 2426572.0, + "reward": 10.33513355255127, + "reward_std": 5.696887969970703, + "rewards/fitness_reward/mean": 6.178849220275879, + "rewards/fitness_reward/std": 3.1863362789154053, + "rewards/kidney_reward/mean": 2.0156962871551514, + "rewards/kidney_reward/std": 1.3245792388916016, + "rewards/length2tails_reward/mean": 0.6156526803970337, + "rewards/length2tails_reward/std": 0.37771496176719666, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9790230989456177, + "rewards/thermo_reward/std": 2.088127374649048, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.40625, + "completions/mean_terminated_length": 271.40625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1069456310942769, + "epoch": 0.562, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08374135196208954, + "learning_rate": 1.9568239379617085e-06, + "loss": -0.0087, + "num_tokens": 2435289.0, + "reward": 11.6463623046875, + "reward_std": 3.750762939453125, + "rewards/fitness_reward/mean": 7.188657283782959, + "rewards/fitness_reward/std": 0.5449937582015991, + "rewards/kidney_reward/mean": 2.1691274642944336, + "rewards/kidney_reward/std": 1.278011679649353, + "rewards/length2tails_reward/mean": 0.6427605152130127, + "rewards/length2tails_reward/std": 0.36037594079971313, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.1243011951446533, + "rewards/thermo_reward/std": 2.355400800704956, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1080938633531332, + "epoch": 0.564, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05745893344283104, + "learning_rate": 1.9564504327386314e-06, + "loss": -0.0014, + "num_tokens": 2444003.0, + "reward": 10.508401870727539, + "reward_std": 6.477418899536133, + "rewards/fitness_reward/mean": 6.332326889038086, + "rewards/fitness_reward/std": 3.255727767944336, + "rewards/kidney_reward/mean": 2.0130109786987305, + "rewards/kidney_reward/std": 1.4932106733322144, + "rewards/length2tails_reward/mean": 0.6512283086776733, + "rewards/length2tails_reward/std": 0.3932029604911804, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9979419708251953, + "rewards/thermo_reward/std": 2.280850648880005, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1069063525646925, + "epoch": 0.566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06927718967199326, + "learning_rate": 1.956075354873137e-06, + "loss": -0.0039, + "num_tokens": 2452683.0, + "reward": 9.580037117004395, + "reward_std": 6.869331359863281, + "rewards/fitness_reward/mean": 6.043339729309082, + "rewards/fitness_reward/std": 3.5511674880981445, + "rewards/kidney_reward/mean": 1.8608475923538208, + "rewards/kidney_reward/std": 1.5245518684387207, + "rewards/length2tails_reward/mean": 0.5623090863227844, + "rewards/length2tails_reward/std": 0.3931002616882324, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 1.5258690118789673, + "rewards/thermo_reward/std": 2.6764273643493652, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10221463162451982, + "epoch": 0.568, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10410986095666885, + "learning_rate": 1.9556987049819476e-06, + "loss": -0.0031, + "num_tokens": 2461370.0, + "reward": 8.779458999633789, + "reward_std": 8.178945541381836, + "rewards/fitness_reward/mean": 5.395164489746094, + "rewards/fitness_reward/std": 4.234164714813232, + "rewards/kidney_reward/mean": 1.618592381477356, + "rewards/kidney_reward/std": 1.9761297702789307, + "rewards/length2tails_reward/mean": 0.6146419048309326, + "rewards/length2tails_reward/std": 0.38370072841644287, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.6042392253875732, + "rewards/thermo_reward/std": 2.7575721740722656, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.9375, + "completions/mean_terminated_length": 269.9375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10469623561948538, + "epoch": 0.57, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11121048033237457, + "learning_rate": 1.9553204836843688e-06, + "loss": -0.0079, + "num_tokens": 2470040.0, + "reward": 11.497368812561035, + "reward_std": 4.967898368835449, + "rewards/fitness_reward/mean": 6.964468955993652, + "rewards/fitness_reward/std": 1.9358503818511963, + "rewards/kidney_reward/mean": 2.1352062225341797, + "rewards/kidney_reward/std": 1.271851897239685, + "rewards/length2tails_reward/mean": 0.529200553894043, + "rewards/length2tails_reward/std": 0.424444317817688, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.2447729110717773, + "rewards/thermo_reward/std": 2.451089382171631, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11302467249333858, + "epoch": 0.572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0896771252155304, + "learning_rate": 1.9549406916022904e-06, + "loss": -0.0066, + "num_tokens": 2478735.0, + "reward": 10.75018310546875, + "reward_std": 6.041914939880371, + "rewards/fitness_reward/mean": 6.624394416809082, + "rewards/fitness_reward/std": 2.682888984680176, + "rewards/kidney_reward/mean": 2.046433448791504, + "rewards/kidney_reward/std": 1.4409186840057373, + "rewards/length2tails_reward/mean": 0.6032804250717163, + "rewards/length2tails_reward/std": 0.40032944083213806, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9190268516540527, + "rewards/thermo_reward/std": 2.572425127029419, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.09375, + "completions/mean_terminated_length": 270.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09249001834541559, + "epoch": 0.574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07904231548309326, + "learning_rate": 1.954559329360185e-06, + "loss": -0.0011, + "num_tokens": 2487410.0, + "reward": 9.763313293457031, + "reward_std": 6.718929767608643, + "rewards/fitness_reward/mean": 6.160808563232422, + "rewards/fitness_reward/std": 3.4212520122528076, + "rewards/kidney_reward/mean": 1.9048984050750732, + "rewards/kidney_reward/std": 1.5606497526168823, + "rewards/length2tails_reward/mean": 0.516953706741333, + "rewards/length2tails_reward/std": 0.39459678530693054, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.5459113121032715, + "rewards/thermo_reward/std": 2.662203788757324, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10279795341193676, + "epoch": 0.576, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07141412794589996, + "learning_rate": 1.954176397585107e-06, + "loss": -0.0029, + "num_tokens": 2496132.0, + "reward": 9.95760726928711, + "reward_std": 6.590401649475098, + "rewards/fitness_reward/mean": 6.230714321136475, + "rewards/fitness_reward/std": 3.2086310386657715, + "rewards/kidney_reward/mean": 1.7944406270980835, + "rewards/kidney_reward/std": 1.552512288093567, + "rewards/length2tails_reward/mean": 0.6762110590934753, + "rewards/length2tails_reward/std": 0.3639642894268036, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.7648320198059082, + "rewards/thermo_reward/std": 2.7508981227874756, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09227543417364359, + "epoch": 0.578, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08819398283958435, + "learning_rate": 1.953791896906692e-06, + "loss": -0.0006, + "num_tokens": 2504809.0, + "reward": 8.959890365600586, + "reward_std": 6.8027191162109375, + "rewards/fitness_reward/mean": 6.060881614685059, + "rewards/fitness_reward/std": 3.4265811443328857, + "rewards/kidney_reward/mean": 1.7525863647460938, + "rewards/kidney_reward/std": 1.6423976421356201, + "rewards/length2tails_reward/mean": 0.5825626850128174, + "rewards/length2tails_reward/std": 0.36792027950286865, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.988166868686676, + "rewards/thermo_reward/std": 2.7589187622070312, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.875, + "completions/mean_terminated_length": 270.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1035487987101078, + "epoch": 0.58, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.060150545090436935, + "learning_rate": 1.9534058279571543e-06, + "loss": -0.0034, + "num_tokens": 2513509.0, + "reward": 11.565305709838867, + "reward_std": 4.800674915313721, + "rewards/fitness_reward/mean": 6.949488639831543, + "rewards/fitness_reward/std": 2.019439697265625, + "rewards/kidney_reward/mean": 2.3376946449279785, + "rewards/kidney_reward/std": 1.153743028640747, + "rewards/length2tails_reward/mean": 0.5994836688041687, + "rewards/length2tails_reward/std": 0.3866693377494812, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.118173599243164, + "rewards/thermo_reward/std": 2.375201463699341, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10638982523232698, + "epoch": 0.582, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09418369829654694, + "learning_rate": 1.953018191371287e-06, + "loss": -0.0008, + "num_tokens": 2522203.0, + "reward": 8.354490280151367, + "reward_std": 8.561019897460938, + "rewards/fitness_reward/mean": 5.269381523132324, + "rewards/fitness_reward/std": 4.298111438751221, + "rewards/kidney_reward/mean": 1.5763875246047974, + "rewards/kidney_reward/std": 1.9843838214874268, + "rewards/length2tails_reward/mean": 0.6417949199676514, + "rewards/length2tails_reward/std": 0.36956968903541565, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 1.3507918119430542, + "rewards/thermo_reward/std": 2.8522260189056396, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09572751075029373, + "epoch": 0.584, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11383271962404251, + "learning_rate": 1.9526289877864616e-06, + "loss": -0.0036, + "num_tokens": 2530883.0, + "reward": 8.888689994812012, + "reward_std": 7.696630477905273, + "rewards/fitness_reward/mean": 5.718735694885254, + "rewards/fitness_reward/std": 3.7902793884277344, + "rewards/kidney_reward/mean": 1.564140796661377, + "rewards/kidney_reward/std": 1.957545518875122, + "rewards/length2tails_reward/mean": 0.5904607176780701, + "rewards/length2tails_reward/std": 0.3688611686229706, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.446768045425415, + "rewards/thermo_reward/std": 3.0040109157562256, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.21875, + "completions/mean_terminated_length": 270.21875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11232748720794916, + "epoch": 0.586, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08226846903562546, + "learning_rate": 1.9522382178426256e-06, + "loss": -0.0042, + "num_tokens": 2539562.0, + "reward": 10.394163131713867, + "reward_std": 6.431159973144531, + "rewards/fitness_reward/mean": 6.652461528778076, + "rewards/fitness_reward/std": 2.7887954711914062, + "rewards/kidney_reward/mean": 1.8936915397644043, + "rewards/kidney_reward/std": 1.7695732116699219, + "rewards/length2tails_reward/mean": 0.5431311130523682, + "rewards/length2tails_reward/std": 0.41602662205696106, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.6936962604522705, + "rewards/thermo_reward/std": 2.7570669651031494, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10617440287023783, + "epoch": 0.588, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1224871352314949, + "learning_rate": 1.9518458821823017e-06, + "loss": -0.0029, + "num_tokens": 2548305.0, + "reward": 12.160886764526367, + "reward_std": 3.0531609058380127, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.3008062839508057, + "rewards/kidney_reward/std": 0.9739616513252258, + "rewards/length2tails_reward/mean": 0.7086777687072754, + "rewards/length2tails_reward/std": 0.36278241872787476, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4430453777313232, + "rewards/thermo_reward/std": 1.9213484525680542, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.53125, + "completions/mean_terminated_length": 270.53125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09158930648118258, + "epoch": 0.59, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06286156922578812, + "learning_rate": 1.9514519814505873e-06, + "loss": -0.0031, + "num_tokens": 2556994.0, + "reward": 10.294597625732422, + "reward_std": 6.84071159362793, + "rewards/fitness_reward/mean": 6.325943470001221, + "rewards/fitness_reward/std": 3.271643877029419, + "rewards/kidney_reward/mean": 1.911651611328125, + "rewards/kidney_reward/std": 1.6634387969970703, + "rewards/length2tails_reward/mean": 0.562515139579773, + "rewards/length2tails_reward/std": 0.42633259296417236, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9007506370544434, + "rewards/thermo_reward/std": 2.5632848739624023, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 268.90625, + "completions/mean_terminated_length": 268.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09582995343953371, + "epoch": 0.592, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06275047361850739, + "learning_rate": 1.9510565162951534e-06, + "loss": -0.0034, + "num_tokens": 2565631.0, + "reward": 10.032028198242188, + "reward_std": 6.264020919799805, + "rewards/fitness_reward/mean": 6.083130836486816, + "rewards/fitness_reward/std": 3.4429848194122314, + "rewards/kidney_reward/mean": 1.9887399673461914, + "rewards/kidney_reward/std": 1.3645800352096558, + "rewards/length2tails_reward/mean": 0.47472113370895386, + "rewards/length2tails_reward/std": 0.4072186350822449, + "rewards/repeated_in_batch_reward/mean": 0.90625, + "rewards/repeated_in_batch_reward/std": 0.2961445748806, + "rewards/thermo_reward/mean": 1.822059988975525, + "rewards/thermo_reward/std": 2.248640775680542, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11088305059820414, + "epoch": 0.594, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08249466121196747, + "learning_rate": 1.9506594873662434e-06, + "loss": -0.0015, + "num_tokens": 2574368.0, + "reward": 11.153301239013672, + "reward_std": 6.116630554199219, + "rewards/fitness_reward/mean": 6.5505523681640625, + "rewards/fitness_reward/std": 2.78484845161438, + "rewards/kidney_reward/mean": 2.10675311088562, + "rewards/kidney_reward/std": 1.4828115701675415, + "rewards/length2tails_reward/mean": 0.6954585313796997, + "rewards/length2tails_reward/std": 0.37617215514183044, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.3264503479003906, + "rewards/thermo_reward/std": 2.2143211364746094, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.59375, + "completions/mean_terminated_length": 270.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1028360053896904, + "epoch": 0.596, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05633142590522766, + "learning_rate": 1.950260895316671e-06, + "loss": -0.0029, + "num_tokens": 2583059.0, + "reward": 10.895092010498047, + "reward_std": 5.3882575035095215, + "rewards/fitness_reward/mean": 6.57216215133667, + "rewards/fitness_reward/std": 2.8825032711029053, + "rewards/kidney_reward/mean": 2.1237099170684814, + "rewards/kidney_reward/std": 1.1620488166809082, + "rewards/length2tails_reward/mean": 0.6051574945449829, + "rewards/length2tails_reward/std": 0.3702513575553894, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.0387041568756104, + "rewards/thermo_reward/std": 2.117142677307129, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.40625, + "completions/mean_terminated_length": 271.40625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10438697133213282, + "epoch": 0.598, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26576927304267883, + "learning_rate": 1.94986074080182e-06, + "loss": -0.0065, + "num_tokens": 2591776.0, + "reward": 10.72160530090332, + "reward_std": 6.307686805725098, + "rewards/fitness_reward/mean": 6.244356155395508, + "rewards/fitness_reward/std": 3.3582611083984375, + "rewards/kidney_reward/mean": 1.9709126949310303, + "rewards/kidney_reward/std": 1.461074709892273, + "rewards/length2tails_reward/mean": 0.6270080804824829, + "rewards/length2tails_reward/std": 0.3762587010860443, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.343636989593506, + "rewards/thermo_reward/std": 2.063184976577759, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.0625, + "completions/mean_terminated_length": 271.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1024255147203803, + "epoch": 0.6, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0735434740781784, + "learning_rate": 1.9494590244796457e-06, + "loss": -0.001, + "num_tokens": 2600482.0, + "reward": 10.59252643585205, + "reward_std": 6.816814422607422, + "rewards/fitness_reward/mean": 6.158326625823975, + "rewards/fitness_reward/std": 3.2900969982147217, + "rewards/kidney_reward/mean": 1.8844225406646729, + "rewards/kidney_reward/std": 1.6350239515304565, + "rewards/length2tails_reward/mean": 0.6254887580871582, + "rewards/length2tails_reward/std": 0.36850452423095703, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.3872289657592773, + "rewards/thermo_reward/std": 2.259819746017456, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.3125, + "completions/mean_terminated_length": 269.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09671336691826582, + "epoch": 0.602, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06916102021932602, + "learning_rate": 1.9490557470106686e-06, + "loss": -0.0066, + "num_tokens": 2609132.0, + "reward": 11.148724555969238, + "reward_std": 3.967028856277466, + "rewards/fitness_reward/mean": 6.99554443359375, + "rewards/fitness_reward/std": 1.7628074884414673, + "rewards/kidney_reward/mean": 2.1857385635375977, + "rewards/kidney_reward/std": 0.9768639206886292, + "rewards/length2tails_reward/mean": 0.5360361933708191, + "rewards/length2tails_reward/std": 0.3558644950389862, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.8138371706008911, + "rewards/thermo_reward/std": 2.337616443634033, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11073332466185093, + "epoch": 0.604, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1010536327958107, + "learning_rate": 1.9486509090579775e-06, + "loss": 0.0004, + "num_tokens": 2617803.0, + "reward": 10.049813270568848, + "reward_std": 7.127676010131836, + "rewards/fitness_reward/mean": 6.250378608703613, + "rewards/fitness_reward/std": 3.3384571075439453, + "rewards/kidney_reward/mean": 1.8567582368850708, + "rewards/kidney_reward/std": 1.7774019241333008, + "rewards/length2tails_reward/mean": 0.5704126358032227, + "rewards/length2tails_reward/std": 0.3750966191291809, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.7856345176696777, + "rewards/thermo_reward/std": 2.6759543418884277, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10591023601591587, + "epoch": 0.606, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09401101619005203, + "learning_rate": 1.948244511287226e-06, + "loss": -0.0038, + "num_tokens": 2626497.0, + "reward": 12.200855255126953, + "reward_std": 2.750121831893921, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3575549125671387, + "rewards/kidney_reward/std": 0.6605173945426941, + "rewards/length2tails_reward/mean": 0.6274683475494385, + "rewards/length2tails_reward/std": 0.32595279812812805, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.3768768310546875, + "rewards/thermo_reward/std": 2.088259220123291, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10753176920115948, + "epoch": 0.608, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05684461444616318, + "learning_rate": 1.9478365543666344e-06, + "loss": 0.0025, + "num_tokens": 2635213.0, + "reward": 11.500574111938477, + "reward_std": 6.049794673919678, + "rewards/fitness_reward/mean": 6.65021276473999, + "rewards/fitness_reward/std": 2.7978789806365967, + "rewards/kidney_reward/mean": 2.128681182861328, + "rewards/kidney_reward/std": 1.4498189687728882, + "rewards/length2tails_reward/mean": 0.7100331783294678, + "rewards/length2tails_reward/std": 0.2788810729980469, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.5506768226623535, + "rewards/thermo_reward/std": 2.18919038772583, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10265343729406595, + "epoch": 0.61, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07681909203529358, + "learning_rate": 1.947427038966984e-06, + "loss": 0.0004, + "num_tokens": 2643907.0, + "reward": 11.134979248046875, + "reward_std": 6.3709282875061035, + "rewards/fitness_reward/mean": 6.6581268310546875, + "rewards/fitness_reward/std": 2.766512632369995, + "rewards/kidney_reward/mean": 2.0494179725646973, + "rewards/kidney_reward/std": 1.6289461851119995, + "rewards/length2tails_reward/mean": 0.6452488899230957, + "rewards/length2tails_reward/std": 0.35534244775772095, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.262909412384033, + "rewards/thermo_reward/std": 2.480961799621582, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.0625, + "completions/mean_terminated_length": 271.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10664802324026823, + "epoch": 0.612, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06387440115213394, + "learning_rate": 1.947015965761621e-06, + "loss": 0.0015, + "num_tokens": 2652613.0, + "reward": 9.361135482788086, + "reward_std": 7.873195648193359, + "rewards/fitness_reward/mean": 5.945728302001953, + "rewards/fitness_reward/std": 3.80493426322937, + "rewards/kidney_reward/mean": 1.7331231832504272, + "rewards/kidney_reward/std": 1.8674038648605347, + "rewards/length2tails_reward/mean": 0.6784215569496155, + "rewards/length2tails_reward/std": 0.3497619032859802, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.5144407749176025, + "rewards/thermo_reward/std": 2.8010506629943848, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.59375, + "completions/mean_terminated_length": 270.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1120320213958621, + "epoch": 0.614, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15682992339134216, + "learning_rate": 1.946603335426452e-06, + "loss": 0.0017, + "num_tokens": 2661304.0, + "reward": 10.289670944213867, + "reward_std": 5.526130676269531, + "rewards/fitness_reward/mean": 6.585597038269043, + "rewards/fitness_reward/std": 2.614732027053833, + "rewards/kidney_reward/mean": 2.012620449066162, + "rewards/kidney_reward/std": 1.3148826360702515, + "rewards/length2tails_reward/mean": 0.6058496832847595, + "rewards/length2tails_reward/std": 0.3908247947692871, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.5308687686920166, + "rewards/thermo_reward/std": 2.6803879737854004, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11361629422754049, + "epoch": 0.616, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11039303243160248, + "learning_rate": 1.946189148639943e-06, + "loss": -0.0011, + "num_tokens": 2670016.0, + "reward": 10.747947692871094, + "reward_std": 6.47308874130249, + "rewards/fitness_reward/mean": 6.574423789978027, + "rewards/fitness_reward/std": 2.8734307289123535, + "rewards/kidney_reward/mean": 2.028642177581787, + "rewards/kidney_reward/std": 1.6022999286651611, + "rewards/length2tails_reward/mean": 0.6725289821624756, + "rewards/length2tails_reward/std": 0.3522748649120331, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.977628469467163, + "rewards/thermo_reward/std": 2.5821573734283447, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10842048842459917, + "epoch": 0.618, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07693401724100113, + "learning_rate": 1.94577340608312e-06, + "loss": 0.001, + "num_tokens": 2678738.0, + "reward": 9.216243743896484, + "reward_std": 8.630293846130371, + "rewards/fitness_reward/mean": 5.8715739250183105, + "rewards/fitness_reward/std": 4.0133185386657715, + "rewards/kidney_reward/mean": 1.5296517610549927, + "rewards/kidney_reward/std": 2.1336543560028076, + "rewards/length2tails_reward/mean": 0.7188667058944702, + "rewards/length2tails_reward/std": 0.3229285180568695, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.6431313753128052, + "rewards/thermo_reward/std": 2.9635863304138184, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.21875, + "completions/mean_terminated_length": 269.21875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10129899997264147, + "epoch": 0.62, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09164420515298843, + "learning_rate": 1.9453561084395687e-06, + "loss": -0.0067, + "num_tokens": 2687385.0, + "reward": 10.077606201171875, + "reward_std": 4.365948677062988, + "rewards/fitness_reward/mean": 6.684309482574463, + "rewards/fitness_reward/std": 2.4438283443450928, + "rewards/kidney_reward/mean": 2.011162281036377, + "rewards/kidney_reward/std": 1.0672303438186646, + "rewards/length2tails_reward/mean": 0.4423019289970398, + "rewards/length2tails_reward/std": 0.39656853675842285, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.2379043102264404, + "rewards/thermo_reward/std": 2.4710021018981934, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10645945090800524, + "epoch": 0.622, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12628157436847687, + "learning_rate": 1.944937256395429e-06, + "loss": -0.006, + "num_tokens": 2696097.0, + "reward": 7.2259135246276855, + "reward_std": 7.998942852020264, + "rewards/fitness_reward/mean": 5.124382019042969, + "rewards/fitness_reward/std": 4.213342666625977, + "rewards/kidney_reward/mean": 1.2453505992889404, + "rewards/kidney_reward/std": 2.059305429458618, + "rewards/length2tails_reward/mean": 0.6736060380935669, + "rewards/length2tails_reward/std": 0.37152162194252014, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.6888208389282227, + "rewards/thermo_reward/std": 2.8730380535125732, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.71875, + "completions/mean_terminated_length": 272.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10902046225965023, + "epoch": 0.624, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08679524809122086, + "learning_rate": 1.9445168506393986e-06, + "loss": 0.0039, + "num_tokens": 2704856.0, + "reward": 11.37441349029541, + "reward_std": 5.79127311706543, + "rewards/fitness_reward/mean": 6.695094585418701, + "rewards/fitness_reward/std": 2.622786521911621, + "rewards/kidney_reward/mean": 2.197542667388916, + "rewards/kidney_reward/std": 1.2092877626419067, + "rewards/length2tails_reward/mean": 0.7713974118232727, + "rewards/length2tails_reward/std": 0.2953493297100067, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.3046369552612305, + "rewards/thermo_reward/std": 2.519876003265381, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.59375, + "completions/mean_terminated_length": 269.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09812906850129366, + "epoch": 0.626, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07246968150138855, + "learning_rate": 1.944094891862728e-06, + "loss": -0.0018, + "num_tokens": 2713515.0, + "reward": 10.279682159423828, + "reward_std": 6.380427360534668, + "rewards/fitness_reward/mean": 6.306829452514648, + "rewards/fitness_reward/std": 3.1515753269195557, + "rewards/kidney_reward/mean": 1.8959795236587524, + "rewards/kidney_reward/std": 1.5268750190734863, + "rewards/length2tails_reward/mean": 0.5150830745697021, + "rewards/length2tails_reward/std": 0.39167141914367676, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9253649711608887, + "rewards/thermo_reward/std": 2.3617825508117676, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09926465712487698, + "epoch": 0.628, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06662730872631073, + "learning_rate": 1.9436713807592232e-06, + "loss": 0.0017, + "num_tokens": 2722186.0, + "reward": 10.776936531066895, + "reward_std": 6.38076639175415, + "rewards/fitness_reward/mean": 6.596822738647461, + "rewards/fitness_reward/std": 2.7853715419769287, + "rewards/kidney_reward/mean": 1.8975119590759277, + "rewards/kidney_reward/std": 1.7479808330535889, + "rewards/length2tails_reward/mean": 0.6078963279724121, + "rewards/length2tails_reward/std": 0.3126949965953827, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.121812582015991, + "rewards/thermo_reward/std": 2.486586332321167, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10481654526665807, + "epoch": 0.63, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08617524802684784, + "learning_rate": 1.943246318025242e-06, + "loss": -0.0013, + "num_tokens": 2730909.0, + "reward": 11.040285110473633, + "reward_std": 5.796336650848389, + "rewards/fitness_reward/mean": 6.635307312011719, + "rewards/fitness_reward/std": 2.6435258388519287, + "rewards/kidney_reward/mean": 1.8447988033294678, + "rewards/kidney_reward/std": 1.6061389446258545, + "rewards/length2tails_reward/mean": 0.6512874364852905, + "rewards/length2tails_reward/std": 0.38937750458717346, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.395050525665283, + "rewards/thermo_reward/std": 2.299553155899048, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11112393904477358, + "epoch": 0.632, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10392550379037857, + "learning_rate": 1.9428197043596927e-06, + "loss": -0.0003, + "num_tokens": 2739629.0, + "reward": 10.80946159362793, + "reward_std": 6.718322277069092, + "rewards/fitness_reward/mean": 6.284722805023193, + "rewards/fitness_reward/std": 3.402113437652588, + "rewards/kidney_reward/mean": 2.026055335998535, + "rewards/kidney_reward/std": 1.583153247833252, + "rewards/length2tails_reward/mean": 0.6818006038665771, + "rewards/length2tails_reward/std": 0.33342015743255615, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.330503463745117, + "rewards/thermo_reward/std": 2.639218807220459, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10043955408036709, + "epoch": 0.634, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07863548398017883, + "learning_rate": 1.9423915404640348e-06, + "loss": -0.008, + "num_tokens": 2748348.0, + "reward": 11.660589218139648, + "reward_std": 4.825669765472412, + "rewards/fitness_reward/mean": 6.682684898376465, + "rewards/fitness_reward/std": 2.681821346282959, + "rewards/kidney_reward/mean": 2.245955467224121, + "rewards/kidney_reward/std": 1.1798933744430542, + "rewards/length2tails_reward/mean": 0.635576605796814, + "rewards/length2tails_reward/std": 0.39163753390312195, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.5683908462524414, + "rewards/thermo_reward/std": 2.1094815731048584, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11370659247040749, + "epoch": 0.636, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08406565338373184, + "learning_rate": 1.9419618270422753e-06, + "loss": -0.0037, + "num_tokens": 2757069.0, + "reward": 9.857394218444824, + "reward_std": 6.708688735961914, + "rewards/fitness_reward/mean": 6.285477638244629, + "rewards/fitness_reward/std": 3.2216105461120605, + "rewards/kidney_reward/mean": 1.8642642498016357, + "rewards/kidney_reward/std": 1.6696314811706543, + "rewards/length2tails_reward/mean": 0.7178899645805359, + "rewards/length2tails_reward/std": 0.3267357647418976, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.535862922668457, + "rewards/thermo_reward/std": 2.7389419078826904, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10528519842773676, + "epoch": 0.638, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14078086614608765, + "learning_rate": 1.9415305648009716e-06, + "loss": -0.003, + "num_tokens": 2765798.0, + "reward": 11.718783378601074, + "reward_std": 4.424414157867432, + "rewards/fitness_reward/mean": 6.930644512176514, + "rewards/fitness_reward/std": 2.1247165203094482, + "rewards/kidney_reward/mean": 2.121047019958496, + "rewards/kidney_reward/std": 1.3946641683578491, + "rewards/length2tails_reward/mean": 0.6864926815032959, + "rewards/length2tails_reward/std": 0.35047489404678345, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4984426498413086, + "rewards/thermo_reward/std": 2.34663724899292, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12153899669647217, + "epoch": 0.64, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8654022812843323, + "learning_rate": 1.9410977544492244e-06, + "loss": -0.003, + "num_tokens": 2774478.0, + "reward": 9.140295028686523, + "reward_std": 6.056877136230469, + "rewards/fitness_reward/mean": 6.590029716491699, + "rewards/fitness_reward/std": 2.633514881134033, + "rewards/kidney_reward/mean": 1.5470025539398193, + "rewards/kidney_reward/std": 1.5915534496307373, + "rewards/length2tails_reward/mean": 0.5990718603134155, + "rewards/length2tails_reward/std": 0.32859253883361816, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 0.8433558344841003, + "rewards/thermo_reward/std": 2.8439509868621826, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.375, + "completions/mean_terminated_length": 269.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10163491126149893, + "epoch": 0.642, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1465456634759903, + "learning_rate": 1.9406633966986824e-06, + "loss": 0.0035, + "num_tokens": 2783130.0, + "reward": 10.073744773864746, + "reward_std": 6.786154270172119, + "rewards/fitness_reward/mean": 6.615808486938477, + "rewards/fitness_reward/std": 2.9360740184783936, + "rewards/kidney_reward/mean": 1.804457426071167, + "rewards/kidney_reward/std": 1.8130881786346436, + "rewards/length2tails_reward/mean": 0.5035993456840515, + "rewards/length2tails_reward/std": 0.4101681113243103, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.5031180381774902, + "rewards/thermo_reward/std": 2.811406135559082, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11280594673007727, + "epoch": 0.644, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06963612139225006, + "learning_rate": 1.9402274922635376e-06, + "loss": -0.0029, + "num_tokens": 2791817.0, + "reward": 10.2340669631958, + "reward_std": 6.397792816162109, + "rewards/fitness_reward/mean": 6.016045570373535, + "rewards/fitness_reward/std": 3.631082773208618, + "rewards/kidney_reward/mean": 2.075089931488037, + "rewards/kidney_reward/std": 1.2228959798812866, + "rewards/length2tails_reward/mean": 0.5873444080352783, + "rewards/length2tails_reward/std": 0.3563327491283417, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 1.99044668674469, + "rewards/thermo_reward/std": 2.3727447986602783, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09691578336060047, + "epoch": 0.646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2780953049659729, + "learning_rate": 1.9397900418605256e-06, + "loss": -0.0086, + "num_tokens": 2800513.0, + "reward": 9.048253059387207, + "reward_std": 7.027248382568359, + "rewards/fitness_reward/mean": 6.214809894561768, + "rewards/fitness_reward/std": 3.2641353607177734, + "rewards/kidney_reward/mean": 1.5307283401489258, + "rewards/kidney_reward/std": 1.8468669652938843, + "rewards/length2tails_reward/mean": 0.5750716328620911, + "rewards/length2tails_reward/std": 0.3983921408653259, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.1452076435089111, + "rewards/thermo_reward/std": 2.8743062019348145, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11906621418893337, + "epoch": 0.648, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06410878151655197, + "learning_rate": 1.9393510462089237e-06, + "loss": -0.002, + "num_tokens": 2809255.0, + "reward": 11.604827880859375, + "reward_std": 5.0774712562561035, + "rewards/fitness_reward/mean": 6.653090953826904, + "rewards/fitness_reward/std": 2.569222927093506, + "rewards/kidney_reward/mean": 2.1688947677612305, + "rewards/kidney_reward/std": 1.2170603275299072, + "rewards/length2tails_reward/mean": 0.7435036301612854, + "rewards/length2tails_reward/std": 0.298550546169281, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.608492374420166, + "rewards/thermo_reward/std": 1.959001898765564, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10088578891009092, + "epoch": 0.65, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15146462619304657, + "learning_rate": 1.938910506030549e-06, + "loss": -0.0078, + "num_tokens": 2817956.0, + "reward": 11.327554702758789, + "reward_std": 4.425415992736816, + "rewards/fitness_reward/mean": 6.687413215637207, + "rewards/fitness_reward/std": 2.4316930770874023, + "rewards/kidney_reward/mean": 2.190837860107422, + "rewards/kidney_reward/std": 0.8751612305641174, + "rewards/length2tails_reward/mean": 0.5867102146148682, + "rewards/length2tails_reward/std": 0.39029496908187866, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 2.2968828678131104, + "rewards/thermo_reward/std": 1.9458266496658325, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11654944997280836, + "epoch": 0.652, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09489252418279648, + "learning_rate": 1.9384684220497604e-06, + "loss": -0.0039, + "num_tokens": 2826683.0, + "reward": 10.793950080871582, + "reward_std": 6.374341011047363, + "rewards/fitness_reward/mean": 6.582357406616211, + "rewards/fitness_reward/std": 2.842876434326172, + "rewards/kidney_reward/mean": 1.9244379997253418, + "rewards/kidney_reward/std": 1.603426218032837, + "rewards/length2tails_reward/mean": 0.7159044742584229, + "rewards/length2tails_reward/std": 0.3271859288215637, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.1155638694763184, + "rewards/thermo_reward/std": 2.4932732582092285, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10629504360258579, + "epoch": 0.654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.38704410195350647, + "learning_rate": 1.938024794993453e-06, + "loss": 0.0021, + "num_tokens": 2835376.0, + "reward": 9.949481964111328, + "reward_std": 7.517545223236084, + "rewards/fitness_reward/mean": 6.169020652770996, + "rewards/fitness_reward/std": 3.3954830169677734, + "rewards/kidney_reward/mean": 1.7824015617370605, + "rewards/kidney_reward/std": 2.01423978805542, + "rewards/length2tails_reward/mean": 0.6802841424942017, + "rewards/length2tails_reward/std": 0.3347759246826172, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.8300318717956543, + "rewards/thermo_reward/std": 2.6380269527435303, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.11221757438033819, + "epoch": 0.656, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1447339504957199, + "learning_rate": 1.9375796255910604e-06, + "loss": 0.0012, + "num_tokens": 2844095.0, + "reward": 12.190248489379883, + "reward_std": 3.808412790298462, + "rewards/fitness_reward/mean": 7.046268939971924, + "rewards/fitness_reward/std": 1.7814339399337769, + "rewards/kidney_reward/mean": 2.4317309856414795, + "rewards/kidney_reward/std": 0.811180591583252, + "rewards/length2tails_reward/mean": 0.7027704119682312, + "rewards/length2tails_reward/std": 0.29092442989349365, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.5419716835021973, + "rewards/thermo_reward/std": 1.7265745401382446, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11657298356294632, + "epoch": 0.658, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7877811193466187, + "learning_rate": 1.937132914574552e-06, + "loss": -0.001, + "num_tokens": 2852811.0, + "reward": 9.436601638793945, + "reward_std": 7.611492156982422, + "rewards/fitness_reward/mean": 6.449830532073975, + "rewards/fitness_reward/std": 3.1732778549194336, + "rewards/kidney_reward/mean": 1.7130616903305054, + "rewards/kidney_reward/std": 2.137336492538452, + "rewards/length2tails_reward/mean": 0.6293182373046875, + "rewards/length2tails_reward/std": 0.3911236524581909, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.1107772588729858, + "rewards/thermo_reward/std": 3.128592014312744, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11583487037569284, + "epoch": 0.66, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13773338496685028, + "learning_rate": 1.936684662678432e-06, + "loss": -0.0028, + "num_tokens": 2861532.0, + "reward": 10.950634956359863, + "reward_std": 5.712095260620117, + "rewards/fitness_reward/mean": 6.851317405700684, + "rewards/fitness_reward/std": 2.2583343982696533, + "rewards/kidney_reward/mean": 1.919154167175293, + "rewards/kidney_reward/std": 1.6229445934295654, + "rewards/length2tails_reward/mean": 0.6816023588180542, + "rewards/length2tails_reward/std": 0.33754870295524597, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.012002944946289, + "rewards/thermo_reward/std": 2.5088531970977783, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10768833290785551, + "epoch": 0.662, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2022908627986908, + "learning_rate": 1.936234870639737e-06, + "loss": 0.0017, + "num_tokens": 2870226.0, + "reward": 8.697061538696289, + "reward_std": 8.931398391723633, + "rewards/fitness_reward/mean": 5.170741081237793, + "rewards/fitness_reward/std": 4.65004301071167, + "rewards/kidney_reward/mean": 1.6327917575836182, + "rewards/kidney_reward/std": 2.033492088317871, + "rewards/length2tails_reward/mean": 0.6267710328102112, + "rewards/length2tails_reward/std": 0.4012013375759125, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.7308508157730103, + "rewards/thermo_reward/std": 2.9484026432037354, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.8125, + "completions/mean_terminated_length": 269.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10277679562568665, + "epoch": 0.664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09230952709913254, + "learning_rate": 1.935783539198038e-06, + "loss": -0.0007, + "num_tokens": 2878892.0, + "reward": 9.331253051757812, + "reward_std": 6.823907852172852, + "rewards/fitness_reward/mean": 6.272053241729736, + "rewards/fitness_reward/std": 3.266080617904663, + "rewards/kidney_reward/mean": 1.7818467617034912, + "rewards/kidney_reward/std": 1.7117173671722412, + "rewards/length2tails_reward/mean": 0.5891803503036499, + "rewards/length2tails_reward/std": 0.3721315562725067, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.118435263633728, + "rewards/thermo_reward/std": 2.897606134414673, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11460565775632858, + "epoch": 0.666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09108323603868484, + "learning_rate": 1.9353306690954357e-06, + "loss": -0.0014, + "num_tokens": 2887625.0, + "reward": 9.689314842224121, + "reward_std": 7.377963542938232, + "rewards/fitness_reward/mean": 6.1329803466796875, + "rewards/fitness_reward/std": 3.3433401584625244, + "rewards/kidney_reward/mean": 1.7278858423233032, + "rewards/kidney_reward/std": 1.962307095527649, + "rewards/length2tails_reward/mean": 0.7054315805435181, + "rewards/length2tails_reward/std": 0.32727426290512085, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.6579058170318604, + "rewards/thermo_reward/std": 2.702836751937866, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1053895577788353, + "epoch": 0.668, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08655820786952972, + "learning_rate": 1.9348762610765613e-06, + "loss": -0.0008, + "num_tokens": 2896328.0, + "reward": 10.018129348754883, + "reward_std": 6.529927730560303, + "rewards/fitness_reward/mean": 6.52497673034668, + "rewards/fitness_reward/std": 2.844560384750366, + "rewards/kidney_reward/mean": 1.8537702560424805, + "rewards/kidney_reward/std": 1.7908141613006592, + "rewards/length2tails_reward/mean": 0.6363632678985596, + "rewards/length2tails_reward/std": 0.3371298611164093, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.4757461547851562, + "rewards/thermo_reward/std": 2.5623059272766113, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.875, + "completions/mean_terminated_length": 270.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11338281631469727, + "epoch": 0.67, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0765661969780922, + "learning_rate": 1.934420315888575e-06, + "loss": 0.0009, + "num_tokens": 2905028.0, + "reward": 11.437816619873047, + "reward_std": 5.1707940101623535, + "rewards/fitness_reward/mean": 6.533980369567871, + "rewards/fitness_reward/std": 2.809199571609497, + "rewards/kidney_reward/mean": 2.2249608039855957, + "rewards/kidney_reward/std": 1.1669909954071045, + "rewards/length2tails_reward/mean": 0.5938228964805603, + "rewards/length2tails_reward/std": 0.3681739270687103, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.5194928646087646, + "rewards/thermo_reward/std": 2.083427667617798, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.4375, + "completions/mean_terminated_length": 269.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1042593028396368, + "epoch": 0.672, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13140137493610382, + "learning_rate": 1.933962834281163e-06, + "loss": 0.0004, + "num_tokens": 2913682.0, + "reward": 10.98188304901123, + "reward_std": 6.552918910980225, + "rewards/fitness_reward/mean": 6.586332321166992, + "rewards/fitness_reward/std": 2.8265011310577393, + "rewards/kidney_reward/mean": 1.8140132427215576, + "rewards/kidney_reward/std": 1.8791821002960205, + "rewards/length2tails_reward/mean": 0.5543262958526611, + "rewards/length2tails_reward/std": 0.3463633358478546, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4261040687561035, + "rewards/thermo_reward/std": 2.4456305503845215, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10754356998950243, + "epoch": 0.674, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06656590104103088, + "learning_rate": 1.93350381700654e-06, + "loss": -0.0006, + "num_tokens": 2922369.0, + "reward": 11.799091339111328, + "reward_std": 5.279061317443848, + "rewards/fitness_reward/mean": 6.693284034729004, + "rewards/fitness_reward/std": 2.6365268230438232, + "rewards/kidney_reward/mean": 2.275409698486328, + "rewards/kidney_reward/std": 1.1351674795150757, + "rewards/length2tails_reward/mean": 0.633697509765625, + "rewards/length2tails_reward/std": 0.3460710048675537, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6670281887054443, + "rewards/thermo_reward/std": 1.997921347618103, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.03125, + "completions/mean_terminated_length": 270.03125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11916785500943661, + "epoch": 0.676, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06394181400537491, + "learning_rate": 1.933043264819444e-06, + "loss": -0.0055, + "num_tokens": 2931042.0, + "reward": 11.553443908691406, + "reward_std": 3.8259716033935547, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.2776575088500977, + "rewards/kidney_reward/std": 0.7665494084358215, + "rewards/length2tails_reward/mean": 0.6201182007789612, + "rewards/length2tails_reward/std": 0.35222068428993225, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.060720920562744, + "rewards/thermo_reward/std": 2.3281824588775635, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10639863647520542, + "epoch": 0.678, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10773520171642303, + "learning_rate": 1.932581178477138e-06, + "loss": -0.0045, + "num_tokens": 2939760.0, + "reward": 12.213360786437988, + "reward_std": 2.925508975982666, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.360633373260498, + "rewards/kidney_reward/std": 0.6457472443580627, + "rewards/length2tails_reward/mean": 0.700934648513794, + "rewards/length2tails_reward/std": 0.3516809046268463, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.3789587020874023, + "rewards/thermo_reward/std": 2.3317079544067383, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10935661476105452, + "epoch": 0.68, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06685982644557953, + "learning_rate": 1.9321175587394056e-06, + "loss": -0.0029, + "num_tokens": 2948488.0, + "reward": 11.586135864257812, + "reward_std": 5.158356666564941, + "rewards/fitness_reward/mean": 6.88628625869751, + "rewards/fitness_reward/std": 2.1171510219573975, + "rewards/kidney_reward/mean": 2.1452465057373047, + "rewards/kidney_reward/std": 1.2479134798049927, + "rewards/length2tails_reward/mean": 0.7242114543914795, + "rewards/length2tails_reward/std": 0.318882554769516, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.3821818828582764, + "rewards/thermo_reward/std": 2.5958993434906006, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10775735974311829, + "epoch": 0.682, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05980192869901657, + "learning_rate": 1.9316524063685538e-06, + "loss": 0.0041, + "num_tokens": 2957187.0, + "reward": 10.59376049041748, + "reward_std": 7.244329452514648, + "rewards/fitness_reward/mean": 6.2857866287231445, + "rewards/fitness_reward/std": 3.397279977798462, + "rewards/kidney_reward/mean": 2.0456247329711914, + "rewards/kidney_reward/std": 1.749367356300354, + "rewards/length2tails_reward/mean": 0.683518648147583, + "rewards/length2tails_reward/std": 0.30251583456993103, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.0939974784851074, + "rewards/thermo_reward/std": 2.440037488937378, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10989144165068865, + "epoch": 0.684, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07758535444736481, + "learning_rate": 1.931185722129409e-06, + "loss": -0.0046, + "num_tokens": 2965914.0, + "reward": 11.012580871582031, + "reward_std": 5.557486057281494, + "rewards/fitness_reward/mean": 6.334482192993164, + "rewards/fitness_reward/std": 3.2543489933013916, + "rewards/kidney_reward/mean": 2.2728075981140137, + "rewards/kidney_reward/std": 0.9552741050720215, + "rewards/length2tails_reward/mean": 0.7059061527252197, + "rewards/length2tails_reward/std": 0.31230825185775757, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.2347002029418945, + "rewards/thermo_reward/std": 2.149380922317505, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11373002640902996, + "epoch": 0.686, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2745387852191925, + "learning_rate": 1.9307175067893163e-06, + "loss": -0.0038, + "num_tokens": 2974643.0, + "reward": 12.919220924377441, + "reward_std": 2.191603899002075, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.460087299346924, + "rewards/kidney_reward/std": 0.6578894257545471, + "rewards/length2tails_reward/mean": 0.6816491484642029, + "rewards/length2tails_reward/std": 0.3293251395225525, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.929783582687378, + "rewards/thermo_reward/std": 1.6954776048660278, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10979206673800945, + "epoch": 0.688, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16077430546283722, + "learning_rate": 1.9302477611181375e-06, + "loss": -0.0072, + "num_tokens": 2983389.0, + "reward": 10.854362487792969, + "reward_std": 5.402705669403076, + "rewards/fitness_reward/mean": 6.654323577880859, + "rewards/fitness_reward/std": 2.782104015350342, + "rewards/kidney_reward/mean": 2.0775442123413086, + "rewards/kidney_reward/std": 1.285241961479187, + "rewards/length2tails_reward/mean": 0.73471599817276, + "rewards/length2tails_reward/std": 0.3409052789211273, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9490220546722412, + "rewards/thermo_reward/std": 2.4640939235687256, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10362495668232441, + "epoch": 0.69, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2713906466960907, + "learning_rate": 1.929776485888251e-06, + "loss": -0.0031, + "num_tokens": 2992120.0, + "reward": 9.594318389892578, + "reward_std": 7.927021503448486, + "rewards/fitness_reward/mean": 5.681684494018555, + "rewards/fitness_reward/std": 3.97599196434021, + "rewards/kidney_reward/mean": 1.7850568294525146, + "rewards/kidney_reward/std": 1.7634572982788086, + "rewards/length2tails_reward/mean": 0.6979485750198364, + "rewards/length2tails_reward/std": 0.35325416922569275, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9577827453613281, + "rewards/thermo_reward/std": 2.700308084487915, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12234244868159294, + "epoch": 0.692, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08907873183488846, + "learning_rate": 1.9293036818745518e-06, + "loss": 0.0007, + "num_tokens": 3000839.0, + "reward": 9.967710494995117, + "reward_std": 7.248884201049805, + "rewards/fitness_reward/mean": 6.2207746505737305, + "rewards/fitness_reward/std": 3.420375108718872, + "rewards/kidney_reward/mean": 1.7617230415344238, + "rewards/kidney_reward/std": 1.7901252508163452, + "rewards/length2tails_reward/mean": 0.6708717346191406, + "rewards/length2tails_reward/std": 0.3487412929534912, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.8181260824203491, + "rewards/thermo_reward/std": 2.7099850177764893, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10335982125252485, + "epoch": 0.694, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08128883689641953, + "learning_rate": 1.9288293498544467e-06, + "loss": 0.0053, + "num_tokens": 3009554.0, + "reward": 9.647581100463867, + "reward_std": 7.206797122955322, + "rewards/fitness_reward/mean": 5.884220123291016, + "rewards/fitness_reward/std": 3.81189227104187, + "rewards/kidney_reward/mean": 1.8862109184265137, + "rewards/kidney_reward/std": 1.6567944288253784, + "rewards/length2tails_reward/mean": 0.6482487916946411, + "rewards/length2tails_reward/std": 0.37604981660842896, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.712324857711792, + "rewards/thermo_reward/std": 2.719363212585449, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09986438229680061, + "epoch": 0.696, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08426442742347717, + "learning_rate": 1.928353490607855e-06, + "loss": -0.0001, + "num_tokens": 3018281.0, + "reward": 10.494661331176758, + "reward_std": 6.4393086433410645, + "rewards/fitness_reward/mean": 6.288505554199219, + "rewards/fitness_reward/std": 3.211290121078491, + "rewards/kidney_reward/mean": 1.9297271966934204, + "rewards/kidney_reward/std": 1.5727407932281494, + "rewards/length2tails_reward/mean": 0.7330443263053894, + "rewards/length2tails_reward/std": 0.30768275260925293, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.103123664855957, + "rewards/thermo_reward/std": 2.4181745052337646, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.625, + "completions/mean_terminated_length": 270.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10438127210363746, + "epoch": 0.698, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.745772659778595, + "learning_rate": 1.9278761049172087e-06, + "loss": -0.0061, + "num_tokens": 3026973.0, + "reward": 9.632065773010254, + "reward_std": 7.670244216918945, + "rewards/fitness_reward/mean": 5.907223224639893, + "rewards/fitness_reward/std": 3.7570505142211914, + "rewards/kidney_reward/mean": 1.6641534566879272, + "rewards/kidney_reward/std": 2.020569086074829, + "rewards/length2tails_reward/mean": 0.6299859285354614, + "rewards/length2tails_reward/std": 0.3401232361793518, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.8976908922195435, + "rewards/thermo_reward/std": 2.6126625537872314, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10190891660749912, + "epoch": 0.7, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0659673810005188, + "learning_rate": 1.927397193567448e-06, + "loss": -0.0023, + "num_tokens": 3035672.0, + "reward": 11.571775436401367, + "reward_std": 4.5618414878845215, + "rewards/fitness_reward/mean": 6.948239326477051, + "rewards/fitness_reward/std": 2.026416063308716, + "rewards/kidney_reward/mean": 2.2341322898864746, + "rewards/kidney_reward/std": 1.0163410902023315, + "rewards/length2tails_reward/mean": 0.6096740961074829, + "rewards/length2tails_reward/std": 0.37831398844718933, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.2284371852874756, + "rewards/thermo_reward/std": 2.161548137664795, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11552073899656534, + "epoch": 0.702, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5920554399490356, + "learning_rate": 1.9269167573460217e-06, + "loss": -0.0022, + "num_tokens": 3044367.0, + "reward": 10.978547096252441, + "reward_std": 6.401198863983154, + "rewards/fitness_reward/mean": 6.606268882751465, + "rewards/fitness_reward/std": 2.7593371868133545, + "rewards/kidney_reward/mean": 2.0434393882751465, + "rewards/kidney_reward/std": 1.6172335147857666, + "rewards/length2tails_reward/mean": 0.6069375276565552, + "rewards/length2tails_reward/std": 0.37754398584365845, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.168145179748535, + "rewards/thermo_reward/std": 2.4649102687835693, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11476656049489975, + "epoch": 0.704, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06389094144105911, + "learning_rate": 1.926434797042887e-06, + "loss": -0.0016, + "num_tokens": 3053087.0, + "reward": 10.932390213012695, + "reward_std": 5.4943342208862305, + "rewards/fitness_reward/mean": 6.568968772888184, + "rewards/fitness_reward/std": 2.8947854042053223, + "rewards/kidney_reward/mean": 2.112169027328491, + "rewards/kidney_reward/std": 1.2832504510879517, + "rewards/length2tails_reward/mean": 0.685340166091919, + "rewards/length2tails_reward/std": 0.3321627676486969, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.082718849182129, + "rewards/thermo_reward/std": 2.573854446411133, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1000164458528161, + "epoch": 0.706, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11736225336790085, + "learning_rate": 1.9259513134505073e-06, + "loss": -0.0043, + "num_tokens": 3061806.0, + "reward": 9.778669357299805, + "reward_std": 6.8691511154174805, + "rewards/fitness_reward/mean": 5.954867362976074, + "rewards/fitness_reward/std": 3.625878095626831, + "rewards/kidney_reward/mean": 1.931515097618103, + "rewards/kidney_reward/std": 1.5763072967529297, + "rewards/length2tails_reward/mean": 0.66201251745224, + "rewards/length2tails_reward/std": 0.3817025125026703, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.7260853052139282, + "rewards/thermo_reward/std": 2.684684991836548, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10831321962177753, + "epoch": 0.708, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5270610451698303, + "learning_rate": 1.9254663073638492e-06, + "loss": -0.0037, + "num_tokens": 3070527.0, + "reward": 11.321937561035156, + "reward_std": 6.7053937911987305, + "rewards/fitness_reward/mean": 6.6266045570373535, + "rewards/fitness_reward/std": 2.8948702812194824, + "rewards/kidney_reward/mean": 1.9176154136657715, + "rewards/kidney_reward/std": 1.8548389673233032, + "rewards/length2tails_reward/mean": 0.694791316986084, + "rewards/length2tails_reward/std": 0.3296026885509491, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.608238458633423, + "rewards/thermo_reward/std": 2.3834054470062256, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11527068726718426, + "epoch": 0.71, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1380949169397354, + "learning_rate": 1.924979779580383e-06, + "loss": -0.004, + "num_tokens": 3079275.0, + "reward": 9.897174835205078, + "reward_std": 7.3936848640441895, + "rewards/fitness_reward/mean": 6.241214752197266, + "rewards/fitness_reward/std": 3.3536489009857178, + "rewards/kidney_reward/mean": 1.7082035541534424, + "rewards/kidney_reward/std": 1.9595487117767334, + "rewards/length2tails_reward/mean": 0.7306416630744934, + "rewards/length2tails_reward/std": 0.3336907625198364, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.7746928930282593, + "rewards/thermo_reward/std": 2.548712968826294, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 264.1875, + "completions/mean_terminated_length": 264.1875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.11325905099511147, + "epoch": 0.712, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0523838996887207, + "learning_rate": 1.9244917309000816e-06, + "loss": -0.0836, + "num_tokens": 3087761.0, + "reward": 9.5830078125, + "reward_std": 7.340405464172363, + "rewards/fitness_reward/mean": 6.4399871826171875, + "rewards/fitness_reward/std": 2.9903438091278076, + "rewards/kidney_reward/mean": 1.6315879821777344, + "rewards/kidney_reward/std": 2.1313037872314453, + "rewards/length2tails_reward/mean": 0.6992827653884888, + "rewards/length2tails_reward/std": 0.29506129026412964, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.3415043354034424, + "rewards/thermo_reward/std": 3.074310302734375, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.1113341748714447, + "epoch": 0.714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.057726092636585236, + "learning_rate": 1.9240021621254186e-06, + "loss": -0.0009, + "num_tokens": 3096458.0, + "reward": 11.602544784545898, + "reward_std": 4.918376445770264, + "rewards/fitness_reward/mean": 6.883650779724121, + "rewards/fitness_reward/std": 2.0794548988342285, + "rewards/kidney_reward/mean": 2.2405173778533936, + "rewards/kidney_reward/std": 1.1649236679077148, + "rewards/length2tails_reward/mean": 0.6127895712852478, + "rewards/length2tails_reward/std": 0.34780457615852356, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.317099094390869, + "rewards/thermo_reward/std": 2.3603625297546387, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.03125, + "completions/mean_terminated_length": 270.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10471488256007433, + "epoch": 0.716, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13354143500328064, + "learning_rate": 1.9235110740613667e-06, + "loss": -0.0041, + "num_tokens": 3105131.0, + "reward": 11.289546966552734, + "reward_std": 6.243673324584961, + "rewards/fitness_reward/mean": 6.612612724304199, + "rewards/fitness_reward/std": 2.7366530895233154, + "rewards/kidney_reward/mean": 2.1773085594177246, + "rewards/kidney_reward/std": 1.4662457704544067, + "rewards/length2tails_reward/mean": 0.5935394763946533, + "rewards/length2tails_reward/std": 0.36415228247642517, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.340271472930908, + "rewards/thermo_reward/std": 2.4543654918670654, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.28125, + "completions/mean_terminated_length": 270.28125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10732367634773254, + "epoch": 0.718, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07978679984807968, + "learning_rate": 1.9230184675153973e-06, + "loss": -0.0057, + "num_tokens": 3113812.0, + "reward": 10.384027481079102, + "reward_std": 6.627580165863037, + "rewards/fitness_reward/mean": 6.309484481811523, + "rewards/fitness_reward/std": 3.155627727508545, + "rewards/kidney_reward/mean": 1.8931320905685425, + "rewards/kidney_reward/std": 1.6427026987075806, + "rewards/length2tails_reward/mean": 0.6456698179244995, + "rewards/length2tails_reward/std": 0.36801210045814514, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 2.0230941772460938, + "rewards/thermo_reward/std": 2.5650124549865723, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.59375, + "completions/mean_terminated_length": 270.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10083027090877295, + "epoch": 0.72, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0760909765958786, + "learning_rate": 1.9225243432974772e-06, + "loss": -0.0039, + "num_tokens": 3122503.0, + "reward": 11.693754196166992, + "reward_std": 3.8968091011047363, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.452592670917511, + "rewards/kidney_reward/mean": 2.171109199523926, + "rewards/kidney_reward/std": 1.2216949462890625, + "rewards/length2tails_reward/mean": 0.5941706299781799, + "rewards/length2tails_reward/std": 0.35726645588874817, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.117060661315918, + "rewards/thermo_reward/std": 2.537241220474243, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10586319025605917, + "epoch": 0.722, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09383681416511536, + "learning_rate": 1.9220287022200707e-06, + "loss": -0.0049, + "num_tokens": 3131191.0, + "reward": 11.630992889404297, + "reward_std": 5.257022380828857, + "rewards/fitness_reward/mean": 6.733970642089844, + "rewards/fitness_reward/std": 2.468461513519287, + "rewards/kidney_reward/mean": 2.2140183448791504, + "rewards/kidney_reward/std": 1.1709821224212646, + "rewards/length2tails_reward/mean": 0.6167657375335693, + "rewards/length2tails_reward/std": 0.37922364473342896, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.521327257156372, + "rewards/thermo_reward/std": 2.263733148574829, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 273.625, + "completions/mean_terminated_length": 273.625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11059287562966347, + "epoch": 0.724, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11690595746040344, + "learning_rate": 1.9215315450981336e-06, + "loss": -0.0029, + "num_tokens": 3139979.0, + "reward": 11.296387672424316, + "reward_std": 5.4516825675964355, + "rewards/fitness_reward/mean": 6.644530296325684, + "rewards/fitness_reward/std": 2.60485577583313, + "rewards/kidney_reward/mean": 2.0352516174316406, + "rewards/kidney_reward/std": 1.3362797498703003, + "rewards/length2tails_reward/mean": 0.7619137763977051, + "rewards/length2tails_reward/std": 0.327181875705719, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4404149055480957, + "rewards/thermo_reward/std": 2.0719480514526367, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10017055552452803, + "epoch": 0.726, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16517867147922516, + "learning_rate": 1.9210328727491173e-06, + "loss": -0.002, + "num_tokens": 3148695.0, + "reward": 10.863248825073242, + "reward_std": 6.320736408233643, + "rewards/fitness_reward/mean": 6.588662147521973, + "rewards/fitness_reward/std": 2.817298412322998, + "rewards/kidney_reward/mean": 2.011671304702759, + "rewards/kidney_reward/std": 1.5752159357070923, + "rewards/length2tails_reward/mean": 0.651005744934082, + "rewards/length2tails_reward/std": 0.37302064895629883, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.0978143215179443, + "rewards/thermo_reward/std": 2.426593542098999, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10572062712162733, + "epoch": 0.728, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5161871314048767, + "learning_rate": 1.920532685992962e-06, + "loss": -0.0005, + "num_tokens": 3157359.0, + "reward": 10.26416015625, + "reward_std": 6.127639293670654, + "rewards/fitness_reward/mean": 6.581520080566406, + "rewards/fitness_reward/std": 2.626322031021118, + "rewards/kidney_reward/mean": 1.7820661067962646, + "rewards/kidney_reward/std": 1.6363115310668945, + "rewards/length2tails_reward/mean": 0.553253173828125, + "rewards/length2tails_reward/std": 0.39293044805526733, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.745248794555664, + "rewards/thermo_reward/std": 2.6991000175476074, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1089264303445816, + "epoch": 0.73, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0864341259002686, + "learning_rate": 1.9200309856520996e-06, + "loss": -0.004, + "num_tokens": 3166085.0, + "reward": 12.792705535888672, + "reward_std": 2.40315580368042, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.394333600997925, + "rewards/kidney_reward/std": 0.8731848001480103, + "rewards/length2tails_reward/mean": 0.62489914894104, + "rewards/length2tails_reward/std": 0.3941367268562317, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.874697208404541, + "rewards/thermo_reward/std": 1.6326818466186523, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 269.78125, + "completions/mean_terminated_length": 269.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11107066180557013, + "epoch": 0.732, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11344432085752487, + "learning_rate": 1.9195277725514506e-06, + "loss": -0.0078, + "num_tokens": 3174750.0, + "reward": 11.069419860839844, + "reward_std": 4.182448387145996, + "rewards/fitness_reward/mean": 6.938035011291504, + "rewards/fitness_reward/std": 1.840762972831726, + "rewards/kidney_reward/mean": 2.2398624420166016, + "rewards/kidney_reward/std": 1.0455249547958374, + "rewards/length2tails_reward/mean": 0.5537076592445374, + "rewards/length2tails_reward/std": 0.387100487947464, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.7361513376235962, + "rewards/thermo_reward/std": 2.2292439937591553, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11792270187288523, + "epoch": 0.734, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.210047647356987, + "learning_rate": 1.9190230475184223e-06, + "loss": -0.0064, + "num_tokens": 3183484.0, + "reward": 11.371177673339844, + "reward_std": 4.733616828918457, + "rewards/fitness_reward/mean": 6.433873176574707, + "rewards/fitness_reward/std": 2.9292948246002197, + "rewards/kidney_reward/mean": 2.1523091793060303, + "rewards/kidney_reward/std": 0.984671413898468, + "rewards/length2tails_reward/mean": 0.7043337821960449, + "rewards/length2tails_reward/std": 0.3285147249698639, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6145615577697754, + "rewards/thermo_reward/std": 1.4447458982467651, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10720256343483925, + "epoch": 0.736, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11020854860544205, + "learning_rate": 1.9185168113829076e-06, + "loss": -0.0084, + "num_tokens": 3192188.0, + "reward": 11.898595809936523, + "reward_std": 2.5480897426605225, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.449726104736328, + "rewards/kidney_reward/std": 0.5813888907432556, + "rewards/length2tails_reward/mean": 0.6259234547615051, + "rewards/length2tails_reward/std": 0.3515353500843048, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9826017618179321, + "rewards/thermo_reward/std": 2.1645750999450684, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10751251876354218, + "epoch": 0.738, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09786931425333023, + "learning_rate": 1.9180090649772854e-06, + "loss": -0.0014, + "num_tokens": 3200885.0, + "reward": 7.86342191696167, + "reward_std": 9.2483491897583, + "rewards/fitness_reward/mean": 4.914643287658691, + "rewards/fitness_reward/std": 4.593165397644043, + "rewards/kidney_reward/mean": 1.3660173416137695, + "rewards/kidney_reward/std": 2.202097177505493, + "rewards/length2tails_reward/mean": 0.645307183265686, + "rewards/length2tails_reward/std": 0.36554959416389465, + "rewards/repeated_in_batch_reward/mean": 0.90625, + "rewards/repeated_in_batch_reward/std": 0.2961445748806, + "rewards/thermo_reward/mean": 1.427605390548706, + "rewards/thermo_reward/std": 2.9483962059020996, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1021569799631834, + "epoch": 0.74, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18761053681373596, + "learning_rate": 1.9174998091364167e-06, + "loss": -0.0076, + "num_tokens": 3209608.0, + "reward": 9.419897079467773, + "reward_std": 7.5619916915893555, + "rewards/fitness_reward/mean": 6.109926223754883, + "rewards/fitness_reward/std": 3.258395195007324, + "rewards/kidney_reward/mean": 1.5979440212249756, + "rewards/kidney_reward/std": 2.025669574737549, + "rewards/length2tails_reward/mean": 0.6712652444839478, + "rewards/length2tails_reward/std": 0.37890389561653137, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.5449011325836182, + "rewards/thermo_reward/std": 2.8237903118133545, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 270.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10275615192949772, + "epoch": 0.742, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17681962251663208, + "learning_rate": 1.9169890446976452e-06, + "loss": -0.0044, + "num_tokens": 3218282.0, + "reward": 8.854126930236816, + "reward_std": 7.579990863800049, + "rewards/fitness_reward/mean": 5.729085922241211, + "rewards/fitness_reward/std": 3.860947608947754, + "rewards/kidney_reward/mean": 1.782204508781433, + "rewards/kidney_reward/std": 1.6522729396820068, + "rewards/length2tails_reward/mean": 0.5530422925949097, + "rewards/length2tails_reward/std": 0.39241811633110046, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 1.1937828063964844, + "rewards/thermo_reward/std": 3.0032665729522705, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10601808875799179, + "epoch": 0.744, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1457470804452896, + "learning_rate": 1.916476772500794e-06, + "loss": -0.0058, + "num_tokens": 3226970.0, + "reward": 12.296627044677734, + "reward_std": 2.8065128326416016, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.430830955505371, + "rewards/kidney_reward/std": 0.6787682175636292, + "rewards/length2tails_reward/mean": 0.6328158378601074, + "rewards/length2tails_reward/std": 0.3681427538394928, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.398838520050049, + "rewards/thermo_reward/std": 2.134150981903076, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 262.84375, + "completions/mean_terminated_length": 262.84375, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "entropy": 0.10207627713680267, + "epoch": 0.746, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3557395040988922, + "learning_rate": 1.9159629933881667e-06, + "loss": -0.1238, + "num_tokens": 3235413.0, + "reward": 11.494861602783203, + "reward_std": 4.767448902130127, + "rewards/fitness_reward/mean": 6.9528045654296875, + "rewards/fitness_reward/std": 2.0009288787841797, + "rewards/kidney_reward/mean": 2.1724929809570312, + "rewards/kidney_reward/std": 1.3247390985488892, + "rewards/length2tails_reward/mean": 0.5239368677139282, + "rewards/length2tails_reward/std": 0.39487963914871216, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.2171709537506104, + "rewards/thermo_reward/std": 2.2351832389831543, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 265.09375, + "completions/mean_terminated_length": 265.09375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.10754709504544735, + "epoch": 0.748, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6204034686088562, + "learning_rate": 1.9154477082045434e-06, + "loss": -0.0661, + "num_tokens": 3243928.0, + "reward": 8.130387306213379, + "reward_std": 8.025690078735352, + "rewards/fitness_reward/mean": 5.279109954833984, + "rewards/fitness_reward/std": 4.192902088165283, + "rewards/kidney_reward/mean": 1.303051233291626, + "rewards/kidney_reward/std": 2.06316876411438, + "rewards/length2tails_reward/mean": 0.6537559032440186, + "rewards/length2tails_reward/std": 0.36036211252212524, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.3828513622283936, + "rewards/thermo_reward/std": 2.7855420112609863, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10372466966509819, + "epoch": 0.75, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20361915230751038, + "learning_rate": 1.914930917797181e-06, + "loss": -0.0033, + "num_tokens": 3252636.0, + "reward": 9.382475852966309, + "reward_std": 8.167882919311523, + "rewards/fitness_reward/mean": 5.807817459106445, + "rewards/fitness_reward/std": 3.692617893218994, + "rewards/kidney_reward/mean": 1.5686777830123901, + "rewards/kidney_reward/std": 2.1148335933685303, + "rewards/length2tails_reward/mean": 0.6882462501525879, + "rewards/length2tails_reward/std": 0.33273714780807495, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 1.8434056043624878, + "rewards/thermo_reward/std": 2.7445905208587646, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11489119380712509, + "epoch": 0.752, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2641088962554932, + "learning_rate": 1.9144126230158124e-06, + "loss": -0.0028, + "num_tokens": 3261360.0, + "reward": 10.802129745483398, + "reward_std": 5.9253644943237305, + "rewards/fitness_reward/mean": 6.857111930847168, + "rewards/fitness_reward/std": 2.2262284755706787, + "rewards/kidney_reward/mean": 1.873659372329712, + "rewards/kidney_reward/std": 1.7289634943008423, + "rewards/length2tails_reward/mean": 0.6913347840309143, + "rewards/length2tails_reward/std": 0.3142402470111847, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9022259712219238, + "rewards/thermo_reward/std": 2.595896005630493, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.5, + "completions/mean_terminated_length": 269.5, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.10804861318320036, + "epoch": 0.754, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3748019337654114, + "learning_rate": 1.913892824712642e-06, + "loss": -0.0373, + "num_tokens": 3270016.0, + "reward": 9.500389099121094, + "reward_std": 8.477140426635742, + "rewards/fitness_reward/mean": 5.897690773010254, + "rewards/fitness_reward/std": 3.9426934719085693, + "rewards/kidney_reward/mean": 1.6560649871826172, + "rewards/kidney_reward/std": 2.092923641204834, + "rewards/length2tails_reward/mean": 0.7651374936103821, + "rewards/length2tails_reward/std": 0.31899499893188477, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.770119547843933, + "rewards/thermo_reward/std": 2.8880178928375244, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10575198102742434, + "epoch": 0.756, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09856077283620834, + "learning_rate": 1.9133715237423485e-06, + "loss": -0.0107, + "num_tokens": 3278703.0, + "reward": 11.312349319458008, + "reward_std": 3.7839372158050537, + "rewards/fitness_reward/mean": 7.188657760620117, + "rewards/fitness_reward/std": 0.7179933190345764, + "rewards/kidney_reward/mean": 2.090399742126465, + "rewards/kidney_reward/std": 1.1582475900650024, + "rewards/length2tails_reward/mean": 0.5980393290519714, + "rewards/length2tails_reward/std": 0.3704410791397095, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.8734878301620483, + "rewards/thermo_reward/std": 2.3937807083129883, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10157476831227541, + "epoch": 0.758, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08300302922725677, + "learning_rate": 1.91284872096208e-06, + "loss": -0.0003, + "num_tokens": 3287412.0, + "reward": 10.58859920501709, + "reward_std": 5.9171247482299805, + "rewards/fitness_reward/mean": 6.681866645812988, + "rewards/fitness_reward/std": 2.685343027114868, + "rewards/kidney_reward/mean": 1.87894606590271, + "rewards/kidney_reward/std": 1.5297423601150513, + "rewards/length2tails_reward/mean": 0.6118855476379395, + "rewards/length2tails_reward/std": 0.3672679364681244, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.8665984869003296, + "rewards/thermo_reward/std": 2.7235138416290283, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10028427001088858, + "epoch": 0.76, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11919562518596649, + "learning_rate": 1.912324417231454e-06, + "loss": 0.0052, + "num_tokens": 3296120.0, + "reward": 8.564162254333496, + "reward_std": 9.746264457702637, + "rewards/fitness_reward/mean": 5.225951194763184, + "rewards/fitness_reward/std": 4.518463611602783, + "rewards/kidney_reward/mean": 1.458353042602539, + "rewards/kidney_reward/std": 2.350266933441162, + "rewards/length2tails_reward/mean": 0.6691723465919495, + "rewards/length2tails_reward/std": 0.3536635637283325, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.7129408121109009, + "rewards/thermo_reward/std": 3.088613748550415, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11146265547722578, + "epoch": 0.762, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2094751000404358, + "learning_rate": 1.9117986134125567e-06, + "loss": 0.0036, + "num_tokens": 3304859.0, + "reward": 11.463823318481445, + "reward_std": 6.830804824829102, + "rewards/fitness_reward/mean": 6.590934753417969, + "rewards/fitness_reward/std": 3.0370945930480957, + "rewards/kidney_reward/mean": 2.0719785690307617, + "rewards/kidney_reward/std": 1.6950889825820923, + "rewards/length2tails_reward/mean": 0.7252353429794312, + "rewards/length2tails_reward/std": 0.32576078176498413, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.628385543823242, + "rewards/thermo_reward/std": 2.4370651245117188, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11498783808201551, + "epoch": 0.764, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1258348971605301, + "learning_rate": 1.91127131036994e-06, + "loss": 0.0005, + "num_tokens": 3313577.0, + "reward": 10.639206886291504, + "reward_std": 6.834832668304443, + "rewards/fitness_reward/mean": 6.58353853225708, + "rewards/fitness_reward/std": 2.860297918319702, + "rewards/kidney_reward/mean": 1.8589577674865723, + "rewards/kidney_reward/std": 1.873213768005371, + "rewards/length2tails_reward/mean": 0.671999990940094, + "rewards/length2tails_reward/std": 0.32619988918304443, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.029510259628296, + "rewards/thermo_reward/std": 2.685245990753174, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10606464743614197, + "epoch": 0.766, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05109286680817604, + "learning_rate": 1.9107425089706216e-06, + "loss": -0.0047, + "num_tokens": 3322271.0, + "reward": 12.418526649475098, + "reward_std": 3.3464503288269043, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.389791488647461, + "rewards/kidney_reward/std": 0.7344213128089905, + "rewards/length2tails_reward/mean": 0.6418942213058472, + "rewards/length2tails_reward/std": 0.3552281856536865, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8114919662475586, + "rewards/thermo_reward/std": 1.5152373313903809, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11245891731232405, + "epoch": 0.768, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07938886433839798, + "learning_rate": 1.9102122100840824e-06, + "loss": 0.0009, + "num_tokens": 3330957.0, + "reward": 10.211996078491211, + "reward_std": 6.531548976898193, + "rewards/fitness_reward/mean": 5.886880874633789, + "rewards/fitness_reward/std": 3.6463463306427, + "rewards/kidney_reward/mean": 1.9389326572418213, + "rewards/kidney_reward/std": 1.5082110166549683, + "rewards/length2tails_reward/mean": 0.6228433847427368, + "rewards/length2tails_reward/std": 0.3479934334754944, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.223897933959961, + "rewards/thermo_reward/std": 2.420728921890259, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10842283070087433, + "epoch": 0.77, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1542162448167801, + "learning_rate": 1.9096804145822653e-06, + "loss": -0.0018, + "num_tokens": 3339701.0, + "reward": 11.679430961608887, + "reward_std": 4.462829113006592, + "rewards/fitness_reward/mean": 6.991600036621094, + "rewards/fitness_reward/std": 1.7847418785095215, + "rewards/kidney_reward/mean": 2.2454633712768555, + "rewards/kidney_reward/std": 1.1345385313034058, + "rewards/length2tails_reward/mean": 0.7023622989654541, + "rewards/length2tails_reward/std": 0.35093632340431213, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.2721309661865234, + "rewards/thermo_reward/std": 2.254243850708008, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10247174184769392, + "epoch": 0.772, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14088068902492523, + "learning_rate": 1.9091471233395747e-06, + "loss": 0.0021, + "num_tokens": 3348404.0, + "reward": 10.934767723083496, + "reward_std": 6.660598278045654, + "rewards/fitness_reward/mean": 6.2669243812561035, + "rewards/fitness_reward/std": 3.459750175476074, + "rewards/kidney_reward/mean": 2.1590218544006348, + "rewards/kidney_reward/std": 1.5328222513198853, + "rewards/length2tails_reward/mean": 0.6023874282836914, + "rewards/length2tails_reward/std": 0.37724438309669495, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.3485822677612305, + "rewards/thermo_reward/std": 2.554950475692749, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11835763789713383, + "epoch": 0.774, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6285804510116577, + "learning_rate": 1.9086123372328743e-06, + "loss": 0.0016, + "num_tokens": 3357135.0, + "reward": 11.003572463989258, + "reward_std": 6.561087608337402, + "rewards/fitness_reward/mean": 6.622054576873779, + "rewards/fitness_reward/std": 2.910947799682617, + "rewards/kidney_reward/mean": 2.1000421047210693, + "rewards/kidney_reward/std": 1.6415328979492188, + "rewards/length2tails_reward/mean": 0.7079252004623413, + "rewards/length2tails_reward/std": 0.2862622141838074, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.110682964324951, + "rewards/thermo_reward/std": 2.5230486392974854, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11549379862844944, + "epoch": 0.776, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20497001707553864, + "learning_rate": 1.9080760571414853e-06, + "loss": 0.0005, + "num_tokens": 3365865.0, + "reward": 11.762812614440918, + "reward_std": 3.7451224327087402, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.2056479454040527, + "rewards/kidney_reward/std": 0.9120680689811707, + "rewards/length2tails_reward/mean": 0.709801197052002, + "rewards/length2tails_reward/std": 0.3223036825656891, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.3331310749053955, + "rewards/thermo_reward/std": 1.9911855459213257, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10378851741552353, + "epoch": 0.778, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09145166724920273, + "learning_rate": 1.9075382839471853e-06, + "loss": -0.0065, + "num_tokens": 3374579.0, + "reward": 11.425798416137695, + "reward_std": 3.9687082767486572, + "rewards/fitness_reward/mean": 6.994507789611816, + "rewards/fitness_reward/std": 1.7685731649398804, + "rewards/kidney_reward/mean": 2.1150007247924805, + "rewards/kidney_reward/std": 1.1914541721343994, + "rewards/length2tails_reward/mean": 0.6470258235931396, + "rewards/length2tails_reward/std": 0.3857588469982147, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.151587963104248, + "rewards/thermo_reward/std": 2.378540277481079, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.59375, + "completions/mean_terminated_length": 270.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10269285459071398, + "epoch": 0.78, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1557752788066864, + "learning_rate": 1.9069990185342073e-06, + "loss": -0.0039, + "num_tokens": 3383270.0, + "reward": 10.588491439819336, + "reward_std": 5.960216045379639, + "rewards/fitness_reward/mean": 6.198329925537109, + "rewards/fitness_reward/std": 3.3503475189208984, + "rewards/kidney_reward/mean": 2.1871232986450195, + "rewards/kidney_reward/std": 1.2717788219451904, + "rewards/length2tails_reward/mean": 0.6088348627090454, + "rewards/length2tails_reward/std": 0.3780163526535034, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.0421547889709473, + "rewards/thermo_reward/std": 2.5141873359680176, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10547183640301228, + "epoch": 0.782, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12822791934013367, + "learning_rate": 1.906458261789238e-06, + "loss": 0.0014, + "num_tokens": 3391967.0, + "reward": 11.095891952514648, + "reward_std": 6.153520584106445, + "rewards/fitness_reward/mean": 6.617605209350586, + "rewards/fitness_reward/std": 2.7073137760162354, + "rewards/kidney_reward/mean": 2.1009960174560547, + "rewards/kidney_reward/std": 1.5732152462005615, + "rewards/length2tails_reward/mean": 0.6067447662353516, + "rewards/length2tails_reward/std": 0.3467532694339752, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.216616153717041, + "rewards/thermo_reward/std": 2.4701502323150635, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10272944066673517, + "epoch": 0.784, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06617357581853867, + "learning_rate": 1.905916014601416e-06, + "loss": -0.0051, + "num_tokens": 3400681.0, + "reward": 11.105353355407715, + "reward_std": 5.018069744110107, + "rewards/fitness_reward/mean": 6.5859375, + "rewards/fitness_reward/std": 2.8291032314300537, + "rewards/kidney_reward/mean": 2.181995153427124, + "rewards/kidney_reward/std": 1.045772671699524, + "rewards/length2tails_reward/mean": 0.6834899187088013, + "rewards/length2tails_reward/std": 0.3619054853916168, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.169072151184082, + "rewards/thermo_reward/std": 2.2546708583831787, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10134368669241667, + "epoch": 0.786, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12521199882030487, + "learning_rate": 1.9053722778623303e-06, + "loss": -0.0058, + "num_tokens": 3409391.0, + "reward": 10.825408935546875, + "reward_std": 5.762734413146973, + "rewards/fitness_reward/mean": 6.575164794921875, + "rewards/fitness_reward/std": 2.658289670944214, + "rewards/kidney_reward/mean": 2.0606515407562256, + "rewards/kidney_reward/std": 1.5197820663452148, + "rewards/length2tails_reward/mean": 0.6492632627487183, + "rewards/length2tails_reward/std": 0.3747102618217468, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.0246660709381104, + "rewards/thermo_reward/std": 2.344879627227783, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11077911220490932, + "epoch": 0.788, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09796229749917984, + "learning_rate": 1.9048270524660196e-06, + "loss": -0.0025, + "num_tokens": 3418088.0, + "reward": 10.854618072509766, + "reward_std": 5.408969879150391, + "rewards/fitness_reward/mean": 6.669445991516113, + "rewards/fitness_reward/std": 2.502680540084839, + "rewards/kidney_reward/mean": 2.0772039890289307, + "rewards/kidney_reward/std": 1.2458466291427612, + "rewards/length2tails_reward/mean": 0.6614983081817627, + "rewards/length2tails_reward/std": 0.337128609418869, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9418175220489502, + "rewards/thermo_reward/std": 2.4592199325561523, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.46875, + "completions/mean_terminated_length": 269.46875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09399615973234177, + "epoch": 0.79, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17700549960136414, + "learning_rate": 1.9042803393089697e-06, + "loss": -0.0078, + "num_tokens": 3426743.0, + "reward": 10.897236824035645, + "reward_std": 6.1277265548706055, + "rewards/fitness_reward/mean": 6.1432342529296875, + "rewards/fitness_reward/std": 3.33506441116333, + "rewards/kidney_reward/mean": 2.0403788089752197, + "rewards/kidney_reward/std": 1.4263689517974854, + "rewards/length2tails_reward/mean": 0.5339868068695068, + "rewards/length2tails_reward/std": 0.36498579382896423, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.560225009918213, + "rewards/thermo_reward/std": 2.0816314220428467, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09640540461987257, + "epoch": 0.792, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05166810750961304, + "learning_rate": 1.9037321392901133e-06, + "loss": -0.0002, + "num_tokens": 3435481.0, + "reward": 11.834596633911133, + "reward_std": 4.384562969207764, + "rewards/fitness_reward/mean": 7.010133266448975, + "rewards/fitness_reward/std": 1.9858490228652954, + "rewards/kidney_reward/mean": 2.284642219543457, + "rewards/kidney_reward/std": 1.0346956253051758, + "rewards/length2tails_reward/mean": 0.6679773330688477, + "rewards/length2tails_reward/std": 0.3801604211330414, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.373023748397827, + "rewards/thermo_reward/std": 1.8982181549072266, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10857865959405899, + "epoch": 0.794, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10565046966075897, + "learning_rate": 1.9031824533108277e-06, + "loss": -0.0042, + "num_tokens": 3444174.0, + "reward": 11.789709091186523, + "reward_std": 4.957429885864258, + "rewards/fitness_reward/mean": 6.921901226043701, + "rewards/fitness_reward/std": 1.926413655281067, + "rewards/kidney_reward/mean": 2.1898322105407715, + "rewards/kidney_reward/std": 1.3164691925048828, + "rewards/length2tails_reward/mean": 0.6206187605857849, + "rewards/length2tails_reward/std": 0.3860568404197693, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.51591420173645, + "rewards/thermo_reward/std": 2.3009755611419678, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 273.0, + "completions/mean_terminated_length": 273.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10428552236407995, + "epoch": 0.796, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9575762152671814, + "learning_rate": 1.9026312822749331e-06, + "loss": 0.0176, + "num_tokens": 3452942.0, + "reward": 10.670546531677246, + "reward_std": 5.758018970489502, + "rewards/fitness_reward/mean": 6.642149925231934, + "rewards/fitness_reward/std": 2.614790201187134, + "rewards/kidney_reward/mean": 1.78218674659729, + "rewards/kidney_reward/std": 1.4918310642242432, + "rewards/length2tails_reward/mean": 0.6436346769332886, + "rewards/length2tails_reward/std": 0.33519431948661804, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.0818464756011963, + "rewards/thermo_reward/std": 2.5326437950134277, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11991294380277395, + "epoch": 0.798, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10349400341510773, + "learning_rate": 1.902078627088692e-06, + "loss": 0.002, + "num_tokens": 3461644.0, + "reward": 11.47077751159668, + "reward_std": 5.410183429718018, + "rewards/fitness_reward/mean": 6.689011573791504, + "rewards/fitness_reward/std": 2.654714345932007, + "rewards/kidney_reward/mean": 2.2360825538635254, + "rewards/kidney_reward/std": 1.1814593076705933, + "rewards/length2tails_reward/mean": 0.6174130439758301, + "rewards/length2tails_reward/std": 0.3640681803226471, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.3839423656463623, + "rewards/thermo_reward/std": 2.1441264152526855, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.11858357861638069, + "epoch": 0.8, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11466143280267715, + "learning_rate": 1.9015244886608068e-06, + "loss": -0.0015, + "num_tokens": 3470383.0, + "reward": 11.97624397277832, + "reward_std": 3.287630081176758, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.224416971206665, + "rewards/kidney_reward/std": 1.1031237840652466, + "rewards/length2tails_reward/mean": 0.7445105314254761, + "rewards/length2tails_reward/std": 0.2538454234600067, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.2736995220184326, + "rewards/thermo_reward/std": 2.2119221687316895, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.34375, + "completions/mean_terminated_length": 269.34375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09968282468616962, + "epoch": 0.802, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17062100768089294, + "learning_rate": 1.9009688679024189e-06, + "loss": -0.0031, + "num_tokens": 3479034.0, + "reward": 10.030776977539062, + "reward_std": 6.229125499725342, + "rewards/fitness_reward/mean": 6.334397315979004, + "rewards/fitness_reward/std": 3.067559003829956, + "rewards/kidney_reward/mean": 1.9138509035110474, + "rewards/kidney_reward/std": 1.6205497980117798, + "rewards/length2tails_reward/mean": 0.5182558298110962, + "rewards/length2tails_reward/std": 0.36070799827575684, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 1.6369529962539673, + "rewards/thermo_reward/std": 2.5060691833496094, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0993448942899704, + "epoch": 0.804, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06334397196769714, + "learning_rate": 1.9004117657271075e-06, + "loss": -0.004, + "num_tokens": 3487720.0, + "reward": 12.601202964782715, + "reward_std": 2.6943719387054443, + "rewards/fitness_reward/mean": 7.011618614196777, + "rewards/fitness_reward/std": 1.977447509765625, + "rewards/kidney_reward/mean": 2.4896838665008545, + "rewards/kidney_reward/std": 0.3229711949825287, + "rewards/length2tails_reward/mean": 0.6432620882987976, + "rewards/length2tails_reward/std": 0.30116140842437744, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9355742931365967, + "rewards/thermo_reward/std": 1.4401322603225708, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.59375, + "completions/mean_terminated_length": 270.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10525709856301546, + "epoch": 0.806, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21384942531585693, + "learning_rate": 1.8998531830508867e-06, + "loss": -0.001, + "num_tokens": 3496411.0, + "reward": 11.023578643798828, + "reward_std": 5.368097305297852, + "rewards/fitness_reward/mean": 6.832054615020752, + "rewards/fitness_reward/std": 2.111349105834961, + "rewards/kidney_reward/mean": 2.1173434257507324, + "rewards/kidney_reward/std": 1.3727282285690308, + "rewards/length2tails_reward/mean": 0.6243403553962708, + "rewards/length2tails_reward/std": 0.3703277111053467, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.911745548248291, + "rewards/thermo_reward/std": 2.5675079822540283, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10209081880748272, + "epoch": 0.808, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0930095762014389, + "learning_rate": 1.8992931207922051e-06, + "loss": -0.0076, + "num_tokens": 3505127.0, + "reward": 10.85621452331543, + "reward_std": 5.562469482421875, + "rewards/fitness_reward/mean": 6.39210319519043, + "rewards/fitness_reward/std": 2.7849199771881104, + "rewards/kidney_reward/mean": 1.9636497497558594, + "rewards/kidney_reward/std": 1.3856197595596313, + "rewards/length2tails_reward/mean": 0.6451936960220337, + "rewards/length2tails_reward/std": 0.38377857208251953, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.335941791534424, + "rewards/thermo_reward/std": 2.192247152328491, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10685683414340019, + "epoch": 0.81, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.40007176995277405, + "learning_rate": 1.898731579871945e-06, + "loss": -0.0077, + "num_tokens": 3513846.0, + "reward": 11.469308853149414, + "reward_std": 4.350868225097656, + "rewards/fitness_reward/mean": 6.99554443359375, + "rewards/fitness_reward/std": 1.7628074884414673, + "rewards/kidney_reward/mean": 2.1276307106018066, + "rewards/kidney_reward/std": 1.243864893913269, + "rewards/length2tails_reward/mean": 0.6337643265724182, + "rewards/length2tails_reward/std": 0.39094334840774536, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.1827573776245117, + "rewards/thermo_reward/std": 2.2920548915863037, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.375, + "completions/mean_terminated_length": 269.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09846197813749313, + "epoch": 0.812, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05986107513308525, + "learning_rate": 1.898168561213419e-06, + "loss": -0.0042, + "num_tokens": 3522498.0, + "reward": 11.358301162719727, + "reward_std": 4.575360298156738, + "rewards/fitness_reward/mean": 6.622168064117432, + "rewards/fitness_reward/std": 2.699398994445801, + "rewards/kidney_reward/mean": 2.2430858612060547, + "rewards/kidney_reward/std": 0.880042552947998, + "rewards/length2tails_reward/mean": 0.5045244097709656, + "rewards/length2tails_reward/std": 0.38825201988220215, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.342595100402832, + "rewards/thermo_reward/std": 2.359964370727539, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10626904852688313, + "epoch": 0.814, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.049734681844711304, + "learning_rate": 1.89760406574237e-06, + "loss": -0.0031, + "num_tokens": 3531217.0, + "reward": 11.872875213623047, + "reward_std": 5.345208644866943, + "rewards/fitness_reward/mean": 6.691218376159668, + "rewards/fitness_reward/std": 2.6453068256378174, + "rewards/kidney_reward/mean": 2.2744810581207275, + "rewards/kidney_reward/std": 1.1397809982299805, + "rewards/length2tails_reward/mean": 0.7228773236274719, + "rewards/length2tails_reward/std": 0.28869134187698364, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7348880767822266, + "rewards/thermo_reward/std": 1.9927328824996948, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09195891162380576, + "epoch": 0.816, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42579376697540283, + "learning_rate": 1.8970380943869686e-06, + "loss": -0.0075, + "num_tokens": 3539963.0, + "reward": 11.169445037841797, + "reward_std": 5.160699844360352, + "rewards/fitness_reward/mean": 6.524928092956543, + "rewards/fitness_reward/std": 2.845306873321533, + "rewards/kidney_reward/mean": 2.268484115600586, + "rewards/kidney_reward/std": 1.0355554819107056, + "rewards/length2tails_reward/mean": 0.6663411855697632, + "rewards/length2tails_reward/std": 0.3943096101284027, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.2093992233276367, + "rewards/thermo_reward/std": 2.1658506393432617, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10751304030418396, + "epoch": 0.818, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07456594705581665, + "learning_rate": 1.8964706480778127e-06, + "loss": -0.0086, + "num_tokens": 3548676.0, + "reward": 11.711389541625977, + "reward_std": 3.914869785308838, + "rewards/fitness_reward/mean": 6.703976631164551, + "rewards/fitness_reward/std": 2.5914342403411865, + "rewards/kidney_reward/mean": 2.4296531677246094, + "rewards/kidney_reward/std": 0.5578335523605347, + "rewards/length2tails_reward/mean": 0.6096435785293579, + "rewards/length2tails_reward/std": 0.3954150378704071, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4167962074279785, + "rewards/thermo_reward/std": 1.7974330186843872, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09460800653323531, + "epoch": 0.82, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05850926786661148, + "learning_rate": 1.8959017277479254e-06, + "loss": -0.0032, + "num_tokens": 3557347.0, + "reward": 10.729347229003906, + "reward_std": 6.073248386383057, + "rewards/fitness_reward/mean": 5.974654674530029, + "rewards/fitness_reward/std": 3.7373361587524414, + "rewards/kidney_reward/mean": 2.17683482170105, + "rewards/kidney_reward/std": 1.2341636419296265, + "rewards/length2tails_reward/mean": 0.5170494318008423, + "rewards/length2tails_reward/std": 0.4222230613231659, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4261536598205566, + "rewards/thermo_reward/std": 2.2774081230163574, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10826176684349775, + "epoch": 0.822, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05722399801015854, + "learning_rate": 1.8953313343327531e-06, + "loss": -0.0024, + "num_tokens": 3566030.0, + "reward": 11.544689178466797, + "reward_std": 5.336467266082764, + "rewards/fitness_reward/mean": 6.698562145233154, + "rewards/fitness_reward/std": 2.614190101623535, + "rewards/kidney_reward/mean": 2.185883045196533, + "rewards/kidney_reward/std": 1.2772369384765625, + "rewards/length2tails_reward/mean": 0.6247825622558594, + "rewards/length2tails_reward/std": 0.3542427718639374, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4977660179138184, + "rewards/thermo_reward/std": 2.0525569915771484, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10833274759352207, + "epoch": 0.824, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07414238899946213, + "learning_rate": 1.8947594687701643e-06, + "loss": -0.004, + "num_tokens": 3574759.0, + "reward": 12.215715408325195, + "reward_std": 3.1908395290374756, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.303682327270508, + "rewards/kidney_reward/std": 0.9398621916770935, + "rewards/length2tails_reward/mean": 0.68268221616745, + "rewards/length2tails_reward/std": 0.3193387985229492, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.440089225769043, + "rewards/thermo_reward/std": 2.3476831912994385, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11643747333437204, + "epoch": 0.826, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.159814715385437, + "learning_rate": 1.8941861320004482e-06, + "loss": -0.0062, + "num_tokens": 3583491.0, + "reward": 12.556082725524902, + "reward_std": 2.924164295196533, + "rewards/fitness_reward/mean": 7.188657283782959, + "rewards/fitness_reward/std": 0.7179933190345764, + "rewards/kidney_reward/mean": 2.3408851623535156, + "rewards/kidney_reward/std": 0.9379902482032776, + "rewards/length2tails_reward/mean": 0.7260844707489014, + "rewards/length2tails_reward/std": 0.3202032148838043, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.853931427001953, + "rewards/thermo_reward/std": 1.5523806810379028, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11242737621068954, + "epoch": 0.828, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06392483413219452, + "learning_rate": 1.8936113249663134e-06, + "loss": -0.0045, + "num_tokens": 3592223.0, + "reward": 12.918121337890625, + "reward_std": 2.424635171890259, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4072585105895996, + "rewards/kidney_reward/std": 0.6688947677612305, + "rewards/length2tails_reward/mean": 0.7507193088531494, + "rewards/length2tails_reward/std": 0.28658536076545715, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.974606513977051, + "rewards/thermo_reward/std": 1.8415789604187012, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.78125, + "completions/mean_terminated_length": 269.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10316573828458786, + "epoch": 0.83, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07275240123271942, + "learning_rate": 1.8930350486128855e-06, + "loss": -0.0017, + "num_tokens": 3600888.0, + "reward": 12.80959701538086, + "reward_std": 1.577688217163086, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.5662388205528259, + "rewards/length2tails_reward/std": 0.3575478494167328, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7473864555358887, + "rewards/thermo_reward/std": 1.4778509140014648, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10222028940916061, + "epoch": 0.832, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08466058224439621, + "learning_rate": 1.8924573038877059e-06, + "loss": -0.0026, + "num_tokens": 3609597.0, + "reward": 11.92800521850586, + "reward_std": 4.447513103485107, + "rewards/fitness_reward/mean": 6.99554443359375, + "rewards/fitness_reward/std": 1.7628074884414673, + "rewards/kidney_reward/mean": 2.132324695587158, + "rewards/kidney_reward/std": 1.3145263195037842, + "rewards/length2tails_reward/mean": 0.6122592687606812, + "rewards/length2tails_reward/std": 0.39825892448425293, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6389098167419434, + "rewards/thermo_reward/std": 2.201195478439331, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11040330212563276, + "epoch": 0.834, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0990830585360527, + "learning_rate": 1.8918780917407306e-06, + "loss": -0.0082, + "num_tokens": 3618311.0, + "reward": 11.632905960083008, + "reward_std": 3.737865686416626, + "rewards/fitness_reward/mean": 7.010110378265381, + "rewards/fitness_reward/std": 1.9859774112701416, + "rewards/kidney_reward/mean": 2.2957663536071777, + "rewards/kidney_reward/std": 0.7961881756782532, + "rewards/length2tails_reward/mean": 0.6458245515823364, + "rewards/length2tails_reward/std": 0.3643723130226135, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.1624460220336914, + "rewards/thermo_reward/std": 2.50215744972229, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10497548617422581, + "epoch": 0.836, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26866573095321655, + "learning_rate": 1.891297413124329e-06, + "loss": -0.0002, + "num_tokens": 3627044.0, + "reward": 11.439022064208984, + "reward_std": 5.316377639770508, + "rewards/fitness_reward/mean": 6.693381309509277, + "rewards/fitness_reward/std": 2.6331963539123535, + "rewards/kidney_reward/mean": 2.306674003601074, + "rewards/kidney_reward/std": 1.155663251876831, + "rewards/length2tails_reward/mean": 0.6525619029998779, + "rewards/length2tails_reward/std": 0.3838775157928467, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.273709774017334, + "rewards/thermo_reward/std": 1.9739047288894653, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10293256212025881, + "epoch": 0.838, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45388221740722656, + "learning_rate": 1.8907152689932807e-06, + "loss": 0.0008, + "num_tokens": 3635768.0, + "reward": 11.114225387573242, + "reward_std": 6.007777214050293, + "rewards/fitness_reward/mean": 6.650113105773926, + "rewards/fitness_reward/std": 2.7990036010742188, + "rewards/kidney_reward/mean": 2.1136951446533203, + "rewards/kidney_reward/std": 1.413481593132019, + "rewards/length2tails_reward/mean": 0.7098208665847778, + "rewards/length2tails_reward/std": 0.31065788865089417, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.1794347763061523, + "rewards/thermo_reward/std": 2.332662343978882, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10914828535169363, + "epoch": 0.84, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17354260385036469, + "learning_rate": 1.890131660304776e-06, + "loss": 0.0035, + "num_tokens": 3644488.0, + "reward": 12.097772598266602, + "reward_std": 4.210140228271484, + "rewards/fitness_reward/mean": 6.984278202056885, + "rewards/fitness_reward/std": 1.8254834413528442, + "rewards/kidney_reward/mean": 2.3367323875427246, + "rewards/kidney_reward/std": 0.9451751112937927, + "rewards/length2tails_reward/mean": 0.648028552532196, + "rewards/length2tails_reward/std": 0.35677388310432434, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.611959934234619, + "rewards/thermo_reward/std": 1.979071021080017, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.53125, + "completions/mean_terminated_length": 270.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11126773897558451, + "epoch": 0.842, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11031896620988846, + "learning_rate": 1.8895465880184118e-06, + "loss": -0.009, + "num_tokens": 3653177.0, + "reward": 11.771194458007812, + "reward_std": 4.116332054138184, + "rewards/fitness_reward/mean": 6.938035011291504, + "rewards/fitness_reward/std": 1.78042471408844, + "rewards/kidney_reward/mean": 2.236675262451172, + "rewards/kidney_reward/std": 0.9763497710227966, + "rewards/length2tails_reward/mean": 0.6223554611206055, + "rewards/length2tails_reward/std": 0.3492722809314728, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4342474937438965, + "rewards/thermo_reward/std": 2.1130752563476562, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 273.375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.10812357719987631, + "epoch": 0.844, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08260457962751389, + "learning_rate": 1.8889600530961932e-06, + "loss": -0.0046, + "num_tokens": 3661957.0, + "reward": 11.629140853881836, + "reward_std": 5.307247161865234, + "rewards/fitness_reward/mean": 6.631275177001953, + "rewards/fitness_reward/std": 2.872642993927002, + "rewards/kidney_reward/mean": 2.2064032554626465, + "rewards/kidney_reward/std": 1.2581707239151, + "rewards/length2tails_reward/mean": 0.8156726956367493, + "rewards/length2tails_reward/std": 0.2643541395664215, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6098947525024414, + "rewards/thermo_reward/std": 2.1621108055114746, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09654233139008284, + "epoch": 0.846, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10825493186712265, + "learning_rate": 1.8883720565025295e-06, + "loss": -0.0025, + "num_tokens": 3670633.0, + "reward": 12.276029586791992, + "reward_std": 3.112684965133667, + "rewards/fitness_reward/mean": 7.188657283782959, + "rewards/fitness_reward/std": 0.7179933190345764, + "rewards/kidney_reward/mean": 2.2949612140655518, + "rewards/kidney_reward/std": 0.9728565216064453, + "rewards/length2tails_reward/mean": 0.5613535642623901, + "rewards/length2tails_reward/std": 0.39826834201812744, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6362757682800293, + "rewards/thermo_reward/std": 1.831773281097412, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.59375, + "completions/mean_terminated_length": 269.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1035864856094122, + "epoch": 0.848, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05640365555882454, + "learning_rate": 1.8877825992042328e-06, + "loss": -0.0017, + "num_tokens": 3679292.0, + "reward": 11.372427940368652, + "reward_std": 5.877617359161377, + "rewards/fitness_reward/mean": 6.392179489135742, + "rewards/fitness_reward/std": 3.06811785697937, + "rewards/kidney_reward/mean": 2.106853485107422, + "rewards/kidney_reward/std": 1.2987583875656128, + "rewards/length2tails_reward/mean": 0.5478678941726685, + "rewards/length2tails_reward/std": 0.38686081767082214, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 2.724857807159424, + "rewards/thermo_reward/std": 1.903943419456482, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.11908046249300241, + "epoch": 0.85, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3277480900287628, + "learning_rate": 1.8871916821705183e-06, + "loss": -0.02, + "num_tokens": 3687975.0, + "reward": 10.093684196472168, + "reward_std": 7.310795783996582, + "rewards/fitness_reward/mean": 6.2110700607299805, + "rewards/fitness_reward/std": 3.644062042236328, + "rewards/kidney_reward/mean": 1.734727144241333, + "rewards/kidney_reward/std": 1.906247854232788, + "rewards/length2tails_reward/mean": 0.6923362016677856, + "rewards/length2tails_reward/std": 0.3460744321346283, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9786536693572998, + "rewards/thermo_reward/std": 2.5642237663269043, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10314168641343713, + "epoch": 0.852, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08059486001729965, + "learning_rate": 1.8865993063730002e-06, + "loss": 0.0004, + "num_tokens": 3696672.0, + "reward": 9.494028091430664, + "reward_std": 7.22643518447876, + "rewards/fitness_reward/mean": 6.156418323516846, + "rewards/fitness_reward/std": 3.2921860218048096, + "rewards/kidney_reward/mean": 1.6028504371643066, + "rewards/kidney_reward/std": 1.8587509393692017, + "rewards/length2tails_reward/mean": 0.5948251485824585, + "rewards/length2tails_reward/std": 0.38797110319137573, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.5752768516540527, + "rewards/thermo_reward/std": 2.861532211303711, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10947284288704395, + "epoch": 0.854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24476461112499237, + "learning_rate": 1.8860054727856914e-06, + "loss": -0.0029, + "num_tokens": 3705413.0, + "reward": 11.377592086791992, + "reward_std": 5.323113441467285, + "rewards/fitness_reward/mean": 6.946459770202637, + "rewards/fitness_reward/std": 2.0363516807556152, + "rewards/kidney_reward/mean": 2.0672290325164795, + "rewards/kidney_reward/std": 1.5073118209838867, + "rewards/length2tails_reward/mean": 0.755418598651886, + "rewards/length2tails_reward/std": 0.28897804021835327, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.1883621215820312, + "rewards/thermo_reward/std": 2.3532443046569824, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.1341212745755911, + "epoch": 0.856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10159477591514587, + "learning_rate": 1.8854101823850024e-06, + "loss": 0.0015, + "num_tokens": 3714174.0, + "reward": 11.991580963134766, + "reward_std": 5.1352057456970215, + "rewards/fitness_reward/mean": 7.010758399963379, + "rewards/fitness_reward/std": 1.9823126792907715, + "rewards/kidney_reward/mean": 2.2044568061828613, + "rewards/kidney_reward/std": 1.312869668006897, + "rewards/length2tails_reward/mean": 0.7862224578857422, + "rewards/length2tails_reward/std": 0.2628347873687744, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.5977439880371094, + "rewards/thermo_reward/std": 2.285738229751587, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11356043815612793, + "epoch": 0.858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09840331226587296, + "learning_rate": 1.8848134361497382e-06, + "loss": -0.0028, + "num_tokens": 3722890.0, + "reward": 11.219654083251953, + "reward_std": 4.991817951202393, + "rewards/fitness_reward/mean": 6.613970756530762, + "rewards/fitness_reward/std": 2.7284114360809326, + "rewards/kidney_reward/mean": 2.212526559829712, + "rewards/kidney_reward/std": 1.1607743501663208, + "rewards/length2tails_reward/mean": 0.6491613984107971, + "rewards/length2tails_reward/std": 0.35882139205932617, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.2282400131225586, + "rewards/thermo_reward/std": 2.2548983097076416, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11092049349099398, + "epoch": 0.86, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18348531424999237, + "learning_rate": 1.884215235061099e-06, + "loss": -0.006, + "num_tokens": 3731604.0, + "reward": 9.037184715270996, + "reward_std": 6.7496137619018555, + "rewards/fitness_reward/mean": 5.8980865478515625, + "rewards/fitness_reward/std": 3.6536824703216553, + "rewards/kidney_reward/mean": 1.7923259735107422, + "rewards/kidney_reward/std": 1.5743216276168823, + "rewards/length2tails_reward/mean": 0.6611948609352112, + "rewards/length2tails_reward/std": 0.36291196942329407, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 1.1869029998779297, + "rewards/thermo_reward/std": 2.8278138637542725, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10679224226623774, + "epoch": 0.862, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.059091147035360336, + "learning_rate": 1.8836155801026753e-06, + "loss": -0.0057, + "num_tokens": 3740362.0, + "reward": 10.398284912109375, + "reward_std": 6.299036502838135, + "rewards/fitness_reward/mean": 6.153982639312744, + "rewards/fitness_reward/std": 3.443929433822632, + "rewards/kidney_reward/mean": 1.9206444025039673, + "rewards/kidney_reward/std": 1.4578499794006348, + "rewards/length2tails_reward/mean": 0.7474456429481506, + "rewards/length2tails_reward/std": 0.3091128468513489, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.148913860321045, + "rewards/thermo_reward/std": 2.431601047515869, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11343036219477654, + "epoch": 0.864, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05083763599395752, + "learning_rate": 1.883014472260449e-06, + "loss": 0.002, + "num_tokens": 3749106.0, + "reward": 12.751008987426758, + "reward_std": 4.422357082366943, + "rewards/fitness_reward/mean": 7.004498481750488, + "rewards/fitness_reward/std": 2.01772403717041, + "rewards/kidney_reward/mean": 2.3845763206481934, + "rewards/kidney_reward/std": 1.0707193613052368, + "rewards/length2tails_reward/mean": 0.7510284781455994, + "rewards/length2tails_reward/std": 0.30357375741004944, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.186831474304199, + "rewards/thermo_reward/std": 1.4339048862457275, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.375, + "completions/mean_terminated_length": 269.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10790925193578005, + "epoch": 0.866, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.052672725170850754, + "learning_rate": 1.8824119125227917e-06, + "loss": -0.0052, + "num_tokens": 3757758.0, + "reward": 12.130245208740234, + "reward_std": 3.5245840549468994, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.4843716621398926, + "rewards/kidney_reward/std": 0.5299732089042664, + "rewards/length2tails_reward/mean": 0.5352551937103271, + "rewards/length2tails_reward/std": 0.34461134672164917, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4392945766448975, + "rewards/thermo_reward/std": 2.1182706356048584, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11035188753157854, + "epoch": 0.868, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10949714481830597, + "learning_rate": 1.881807901880461e-06, + "loss": 0.0003, + "num_tokens": 3766479.0, + "reward": 11.700475692749023, + "reward_std": 4.995792865753174, + "rewards/fitness_reward/mean": 6.950691223144531, + "rewards/fitness_reward/std": 2.012726306915283, + "rewards/kidney_reward/mean": 2.1191606521606445, + "rewards/kidney_reward/std": 1.2957903146743774, + "rewards/length2tails_reward/mean": 0.6825649738311768, + "rewards/length2tails_reward/std": 0.33565008640289307, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4623684883117676, + "rewards/thermo_reward/std": 2.338465929031372, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11510983109474182, + "epoch": 0.87, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09583006799221039, + "learning_rate": 1.8812024413266004e-06, + "loss": -0.0006, + "num_tokens": 3775163.0, + "reward": 10.369340896606445, + "reward_std": 6.81846809387207, + "rewards/fitness_reward/mean": 6.297909736633301, + "rewards/fitness_reward/std": 3.1902825832366943, + "rewards/kidney_reward/mean": 1.9326624870300293, + "rewards/kidney_reward/std": 1.5834283828735352, + "rewards/length2tails_reward/mean": 0.6140122413635254, + "rewards/length2tails_reward/std": 0.34732601046562195, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9773672819137573, + "rewards/thermo_reward/std": 2.6822171211242676, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12347112130373716, + "epoch": 0.872, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08026017993688583, + "learning_rate": 1.8805955318567379e-06, + "loss": -0.0037, + "num_tokens": 3783913.0, + "reward": 11.330899238586426, + "reward_std": 3.2819528579711914, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.019801616668701, + "rewards/kidney_reward/std": 1.1881566047668457, + "rewards/length2tails_reward/mean": 0.6870349049568176, + "rewards/length2tails_reward/std": 0.3882122337818146, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.7812089920043945, + "rewards/thermo_reward/std": 2.2675986289978027, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.03125, + "completions/mean_terminated_length": 271.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10257836803793907, + "epoch": 0.874, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06457014381885529, + "learning_rate": 1.8799871744687837e-06, + "loss": -0.0016, + "num_tokens": 3792618.0, + "reward": 11.247159957885742, + "reward_std": 5.1798176765441895, + "rewards/fitness_reward/mean": 6.952747344970703, + "rewards/fitness_reward/std": 2.001246929168701, + "rewards/kidney_reward/mean": 2.117042303085327, + "rewards/kidney_reward/std": 1.360007405281067, + "rewards/length2tails_reward/mean": 0.6416828632354736, + "rewards/length2tails_reward/std": 0.35022541880607605, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.013201951980591, + "rewards/thermo_reward/std": 2.656627655029297, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.0625, + "completions/mean_terminated_length": 271.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10436887294054031, + "epoch": 0.876, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07170294970273972, + "learning_rate": 1.8793773701630285e-06, + "loss": -0.0082, + "num_tokens": 3801324.0, + "reward": 11.48604965209961, + "reward_std": 4.0768280029296875, + "rewards/fitness_reward/mean": 6.872337341308594, + "rewards/fitness_reward/std": 2.1419677734375, + "rewards/kidney_reward/mean": 2.178575038909912, + "rewards/kidney_reward/std": 0.9928116202354431, + "rewards/length2tails_reward/mean": 0.6542539596557617, + "rewards/length2tails_reward/std": 0.36290809512138367, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.269711494445801, + "rewards/thermo_reward/std": 2.394803524017334, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.53125, + "completions/mean_terminated_length": 270.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10206220205873251, + "epoch": 0.878, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.499766081571579, + "learning_rate": 1.8787661199421426e-06, + "loss": -0.0035, + "num_tokens": 3810013.0, + "reward": 10.417783737182617, + "reward_std": 6.227560520172119, + "rewards/fitness_reward/mean": 6.27559757232666, + "rewards/fitness_reward/std": 3.4502835273742676, + "rewards/kidney_reward/mean": 2.142082691192627, + "rewards/kidney_reward/std": 1.3962165117263794, + "rewards/length2tails_reward/mean": 0.6051323413848877, + "rewards/length2tails_reward/std": 0.37149810791015625, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.8395893573760986, + "rewards/thermo_reward/std": 2.511199474334717, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10686526820063591, + "epoch": 0.88, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06103542819619179, + "learning_rate": 1.8781534248111729e-06, + "loss": -0.0041, + "num_tokens": 3818743.0, + "reward": 12.375520706176758, + "reward_std": 3.433697462081909, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.36761474609375, + "rewards/kidney_reward/std": 0.7219849228858948, + "rewards/length2tails_reward/mean": 0.7102646827697754, + "rewards/length2tails_reward/std": 0.3061424493789673, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7838258743286133, + "rewards/thermo_reward/std": 1.7129298448562622, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10807423712685704, + "epoch": 0.882, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08479100465774536, + "learning_rate": 1.877539285777543e-06, + "loss": -0.0083, + "num_tokens": 3827471.0, + "reward": 11.815866470336914, + "reward_std": 5.167867183685303, + "rewards/fitness_reward/mean": 6.632122039794922, + "rewards/fitness_reward/std": 2.65669584274292, + "rewards/kidney_reward/mean": 2.2621984481811523, + "rewards/kidney_reward/std": 1.1790883541107178, + "rewards/length2tails_reward/mean": 0.6838822960853577, + "rewards/length2tails_reward/std": 0.3464617431163788, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7531585693359375, + "rewards/thermo_reward/std": 1.9235625267028809, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10133147519081831, + "epoch": 0.884, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07655313611030579, + "learning_rate": 1.8769237038510499e-06, + "loss": -0.0091, + "num_tokens": 3836187.0, + "reward": 12.801630020141602, + "reward_std": 2.2215499877929688, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.464538097381592, + "rewards/kidney_reward/std": 0.6341493725776672, + "rewards/length2tails_reward/mean": 0.6683378219604492, + "rewards/length2tails_reward/std": 0.3719833195209503, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.809072971343994, + "rewards/thermo_reward/std": 1.7491624355316162, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.65625, + "completions/mean_terminated_length": 271.65625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11100908927619457, + "epoch": 0.886, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06725804507732391, + "learning_rate": 1.8763066800438634e-06, + "loss": -0.0025, + "num_tokens": 3844912.0, + "reward": 11.207784652709961, + "reward_std": 5.085540294647217, + "rewards/fitness_reward/mean": 6.943241119384766, + "rewards/fitness_reward/std": 2.0543272495269775, + "rewards/kidney_reward/mean": 2.169588804244995, + "rewards/kidney_reward/std": 1.2773592472076416, + "rewards/length2tails_reward/mean": 0.6841601133346558, + "rewards/length2tails_reward/std": 0.33557799458503723, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9265384674072266, + "rewards/thermo_reward/std": 2.5070431232452393, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10447764955461025, + "epoch": 0.888, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06931206583976746, + "learning_rate": 1.8756882153705246e-06, + "loss": -0.0056, + "num_tokens": 3853623.0, + "reward": 12.738012313842773, + "reward_std": 2.0027472972869873, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4896838665008545, + "rewards/kidney_reward/std": 0.3229711949825287, + "rewards/length2tails_reward/mean": 0.6159868836402893, + "rewards/length2tails_reward/std": 0.3990537226200104, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7255449295043945, + "rewards/thermo_reward/std": 1.750189185142517, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10720862448215485, + "epoch": 0.89, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0804607942700386, + "learning_rate": 1.875068310847943e-06, + "loss": -0.0008, + "num_tokens": 3862325.0, + "reward": 12.044584274291992, + "reward_std": 4.262816905975342, + "rewards/fitness_reward/mean": 6.969001293182373, + "rewards/fitness_reward/std": 1.910581111907959, + "rewards/kidney_reward/mean": 2.3637166023254395, + "rewards/kidney_reward/std": 0.8406797647476196, + "rewards/length2tails_reward/mean": 0.7061450481414795, + "rewards/length2tails_reward/std": 0.27908092737197876, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.5412521362304688, + "rewards/thermo_reward/std": 2.0376970767974854, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09836816508322954, + "epoch": 0.892, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07819455862045288, + "learning_rate": 1.8744469674953955e-06, + "loss": -0.0041, + "num_tokens": 3871001.0, + "reward": 9.899242401123047, + "reward_std": 6.037757873535156, + "rewards/fitness_reward/mean": 6.267643928527832, + "rewards/fitness_reward/std": 3.281162738800049, + "rewards/kidney_reward/mean": 1.7312264442443848, + "rewards/kidney_reward/std": 1.5176829099655151, + "rewards/length2tails_reward/mean": 0.5764222145080566, + "rewards/length2tails_reward/std": 0.3620404303073883, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.7427294254302979, + "rewards/thermo_reward/std": 2.74582576751709, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09943460021167994, + "epoch": 0.894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15047623217105865, + "learning_rate": 1.873824186334526e-06, + "loss": -0.0073, + "num_tokens": 3879710.0, + "reward": 11.746528625488281, + "reward_std": 4.381689071655273, + "rewards/fitness_reward/mean": 7.131148338317871, + "rewards/fitness_reward/std": 0.905185341835022, + "rewards/kidney_reward/mean": 2.1221461296081543, + "rewards/kidney_reward/std": 1.507962703704834, + "rewards/length2tails_reward/mean": 0.6461377143859863, + "rewards/length2tails_reward/std": 0.3861064612865448, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.3286209106445312, + "rewards/thermo_reward/std": 2.35248064994812, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11172295361757278, + "epoch": 0.896, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06506041437387466, + "learning_rate": 1.8731999683893402e-06, + "loss": -0.0047, + "num_tokens": 3888445.0, + "reward": 12.793821334838867, + "reward_std": 3.550959348678589, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.3735175132751465, + "rewards/kidney_reward/std": 0.917577862739563, + "rewards/length2tails_reward/mean": 0.7215828895568848, + "rewards/length2tails_reward/std": 0.27914682030677795, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.19509220123291, + "rewards/thermo_reward/std": 1.4694974422454834, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1053133811801672, + "epoch": 0.898, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08816291391849518, + "learning_rate": 1.8725743146862092e-06, + "loss": 0.0001, + "num_tokens": 3897143.0, + "reward": 11.757261276245117, + "reward_std": 5.4524030685424805, + "rewards/fitness_reward/mean": 6.697388172149658, + "rewards/fitness_reward/std": 2.6181633472442627, + "rewards/kidney_reward/mean": 2.312431812286377, + "rewards/kidney_reward/std": 1.1747411489486694, + "rewards/length2tails_reward/mean": 0.6832621097564697, + "rewards/length2tails_reward/std": 0.32060670852661133, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.5791144371032715, + "rewards/thermo_reward/std": 2.3686468601226807, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10911222640424967, + "epoch": 0.9, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.053479693830013275, + "learning_rate": 1.8719472262538622e-06, + "loss": 0.0004, + "num_tokens": 3905872.0, + "reward": 12.228736877441406, + "reward_std": 4.497715950012207, + "rewards/fitness_reward/mean": 7.005762577056885, + "rewards/fitness_reward/std": 2.010572910308838, + "rewards/kidney_reward/mean": 2.346081495285034, + "rewards/kidney_reward/std": 0.9915664196014404, + "rewards/length2tails_reward/mean": 0.6802853941917419, + "rewards/length2tails_reward/std": 0.34215471148490906, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.708864688873291, + "rewards/thermo_reward/std": 2.0118680000305176, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 265.15625, + "completions/mean_terminated_length": 265.15625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.10868939571082592, + "epoch": 0.902, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2735053300857544, + "learning_rate": 1.8713187041233893e-06, + "loss": -0.0532, + "num_tokens": 3914389.0, + "reward": 9.818794250488281, + "reward_std": 7.534875392913818, + "rewards/fitness_reward/mean": 6.295016765594482, + "rewards/fitness_reward/std": 3.373291492462158, + "rewards/kidney_reward/mean": 1.7130271196365356, + "rewards/kidney_reward/std": 1.9210028648376465, + "rewards/length2tails_reward/mean": 0.6132369041442871, + "rewards/length2tails_reward/std": 0.3320391774177551, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.6494271755218506, + "rewards/thermo_reward/std": 3.0261270999908447, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11821938399225473, + "epoch": 0.904, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1313985288143158, + "learning_rate": 1.8706887493282366e-06, + "loss": 0.004, + "num_tokens": 3923093.0, + "reward": 10.177400588989258, + "reward_std": 7.999275207519531, + "rewards/fitness_reward/mean": 6.244592666625977, + "rewards/fitness_reward/std": 3.542152166366577, + "rewards/kidney_reward/mean": 1.8024885654449463, + "rewards/kidney_reward/std": 2.027366876602173, + "rewards/length2tails_reward/mean": 0.6615628600120544, + "rewards/length2tails_reward/std": 0.3624088764190674, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9641624689102173, + "rewards/thermo_reward/std": 2.8660004138946533, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10027664620429277, + "epoch": 0.906, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07662325352430344, + "learning_rate": 1.870057362904207e-06, + "loss": -0.0087, + "num_tokens": 3931788.0, + "reward": 11.619586944580078, + "reward_std": 3.6709110736846924, + "rewards/fitness_reward/mean": 6.9872660636901855, + "rewards/fitness_reward/std": 2.1152055263519287, + "rewards/kidney_reward/mean": 2.3769898414611816, + "rewards/kidney_reward/std": 0.6920126080513, + "rewards/length2tails_reward/mean": 0.5850013494491577, + "rewards/length2tails_reward/std": 0.3896300792694092, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.0968310832977295, + "rewards/thermo_reward/std": 2.492100477218628, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.5625, + "completions/mean_terminated_length": 269.5625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.1027941033244133, + "epoch": 0.908, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.073139987885952, + "learning_rate": 1.8694245458894566e-06, + "loss": 0.0003, + "num_tokens": 3940446.0, + "reward": 10.70131778717041, + "reward_std": 5.772634506225586, + "rewards/fitness_reward/mean": 6.5864458084106445, + "rewards/fitness_reward/std": 2.652723550796509, + "rewards/kidney_reward/mean": 2.006622552871704, + "rewards/kidney_reward/std": 1.4546256065368652, + "rewards/length2tails_reward/mean": 0.5887947082519531, + "rewards/length2tails_reward/std": 0.41531050205230713, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9493696689605713, + "rewards/thermo_reward/std": 2.536822557449341, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12032953463494778, + "epoch": 0.91, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10334499925374985, + "learning_rate": 1.8687902993244942e-06, + "loss": -0.0033, + "num_tokens": 3949167.0, + "reward": 10.728096008300781, + "reward_std": 6.511604309082031, + "rewards/fitness_reward/mean": 6.469539642333984, + "rewards/fitness_reward/std": 2.875175952911377, + "rewards/kidney_reward/mean": 2.025391101837158, + "rewards/kidney_reward/std": 1.6085196733474731, + "rewards/length2tails_reward/mean": 0.6913998126983643, + "rewards/length2tails_reward/std": 0.308725506067276, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.0640251636505127, + "rewards/thermo_reward/std": 2.5514793395996094, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11203371826559305, + "epoch": 0.912, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09344188123941422, + "learning_rate": 1.8681546242521783e-06, + "loss": 0.0019, + "num_tokens": 3957876.0, + "reward": 9.808435440063477, + "reward_std": 6.910869598388672, + "rewards/fitness_reward/mean": 6.19857120513916, + "rewards/fitness_reward/std": 3.487494945526123, + "rewards/kidney_reward/mean": 1.8561313152313232, + "rewards/kidney_reward/std": 1.8377562761306763, + "rewards/length2tails_reward/mean": 0.6736408472061157, + "rewards/length2tails_reward/std": 0.3606426417827606, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.5863685607910156, + "rewards/thermo_reward/std": 2.6774237155914307, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1099884919822216, + "epoch": 0.914, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1273649036884308, + "learning_rate": 1.8675175217177175e-06, + "loss": -0.0006, + "num_tokens": 3966611.0, + "reward": 10.460234642028809, + "reward_std": 6.559492111206055, + "rewards/fitness_reward/mean": 6.5706024169921875, + "rewards/fitness_reward/std": 2.889678716659546, + "rewards/kidney_reward/mean": 1.9447031021118164, + "rewards/kidney_reward/std": 1.5729787349700928, + "rewards/length2tails_reward/mean": 0.7160842418670654, + "rewards/length2tails_reward/std": 0.3384973108768463, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.7733216285705566, + "rewards/thermo_reward/std": 2.7137043476104736, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.875, + "completions/mean_terminated_length": 269.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10272444318979979, + "epoch": 0.916, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.060355983674526215, + "learning_rate": 1.8668789927686666e-06, + "loss": -0.0059, + "num_tokens": 3975279.0, + "reward": 11.540103912353516, + "reward_std": 3.569697618484497, + "rewards/fitness_reward/mean": 6.937399864196777, + "rewards/fitness_reward/std": 1.8441245555877686, + "rewards/kidney_reward/mean": 2.296140432357788, + "rewards/kidney_reward/std": 0.7964839935302734, + "rewards/length2tails_reward/mean": 0.5815209746360779, + "rewards/length2tails_reward/std": 0.32663068175315857, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.148411273956299, + "rewards/thermo_reward/std": 2.218916654586792, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10958459321409464, + "epoch": 0.918, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0612795390188694, + "learning_rate": 1.866239038454926e-06, + "loss": -0.0016, + "num_tokens": 3984020.0, + "reward": 12.074222564697266, + "reward_std": 4.837175369262695, + "rewards/fitness_reward/mean": 7.007224082946777, + "rewards/fitness_reward/std": 2.002307653427124, + "rewards/kidney_reward/mean": 2.2945916652679443, + "rewards/kidney_reward/std": 1.1536694765090942, + "rewards/length2tails_reward/mean": 0.7487285733222961, + "rewards/length2tails_reward/std": 0.2750574052333832, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.5975348949432373, + "rewards/thermo_reward/std": 2.3417248725891113, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09708154760301113, + "epoch": 0.92, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14464029669761658, + "learning_rate": 1.8655976598287392e-06, + "loss": -0.0104, + "num_tokens": 3992684.0, + "reward": 11.042967796325684, + "reward_std": 5.450899124145508, + "rewards/fitness_reward/mean": 6.508005142211914, + "rewards/fitness_reward/std": 2.7292606830596924, + "rewards/kidney_reward/mean": 2.1265482902526855, + "rewards/kidney_reward/std": 1.3590309619903564, + "rewards/length2tails_reward/mean": 0.5516175627708435, + "rewards/length2tails_reward/std": 0.41006430983543396, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.2532529830932617, + "rewards/thermo_reward/std": 2.283200740814209, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.40625, + "completions/mean_terminated_length": 270.40625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10907396487891674, + "epoch": 0.922, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07392656803131104, + "learning_rate": 1.8649548579446935e-06, + "loss": -0.0046, + "num_tokens": 4001369.0, + "reward": 11.953383445739746, + "reward_std": 3.702449321746826, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.452592670917511, + "rewards/kidney_reward/mean": 2.1486942768096924, + "rewards/kidney_reward/std": 1.3303357362747192, + "rewards/length2tails_reward/mean": 0.6268943548202515, + "rewards/length2tails_reward/std": 0.36906784772872925, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.3958330154418945, + "rewards/thermo_reward/std": 2.2485578060150146, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10220075491815805, + "epoch": 0.924, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07153013348579407, + "learning_rate": 1.864310633859714e-06, + "loss": -0.0018, + "num_tokens": 4010087.0, + "reward": 11.94033145904541, + "reward_std": 4.323536396026611, + "rewards/fitness_reward/mean": 6.880142688751221, + "rewards/fitness_reward/std": 1.8578017950057983, + "rewards/kidney_reward/mean": 2.155754804611206, + "rewards/kidney_reward/std": 0.9834696054458618, + "rewards/length2tails_reward/mean": 0.6571721434593201, + "rewards/length2tails_reward/std": 0.33760565519332886, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7387170791625977, + "rewards/thermo_reward/std": 2.0518715381622314, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10953015740960836, + "epoch": 0.926, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08538804203271866, + "learning_rate": 1.863664988633066e-06, + "loss": -0.0035, + "num_tokens": 4018857.0, + "reward": 10.794061660766602, + "reward_std": 5.991631507873535, + "rewards/fitness_reward/mean": 6.6039018630981445, + "rewards/fitness_reward/std": 2.7577898502349854, + "rewards/kidney_reward/mean": 2.080371856689453, + "rewards/kidney_reward/std": 1.446594476699829, + "rewards/length2tails_reward/mean": 0.7289041876792908, + "rewards/length2tails_reward/std": 0.3512548506259918, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9368979930877686, + "rewards/thermo_reward/std": 2.4265477657318115, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11466868966817856, + "epoch": 0.928, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05370505526661873, + "learning_rate": 1.86301792332635e-06, + "loss": 0.0004, + "num_tokens": 4027559.0, + "reward": 12.159685134887695, + "reward_std": 4.429521560668945, + "rewards/fitness_reward/mean": 6.945412635803223, + "rewards/fitness_reward/std": 2.042198896408081, + "rewards/kidney_reward/mean": 2.3425214290618896, + "rewards/kidney_reward/std": 1.0108206272125244, + "rewards/length2tails_reward/mean": 0.6667732000350952, + "rewards/length2tails_reward/std": 0.3295728862285614, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7050743103027344, + "rewards/thermo_reward/std": 1.7441182136535645, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10749402642250061, + "epoch": 0.93, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06493767350912094, + "learning_rate": 1.8623694390035035e-06, + "loss": -0.0022, + "num_tokens": 4036287.0, + "reward": 11.909021377563477, + "reward_std": 4.883798599243164, + "rewards/fitness_reward/mean": 7.002554893493652, + "rewards/fitness_reward/std": 2.0287187099456787, + "rewards/kidney_reward/mean": 2.2224979400634766, + "rewards/kidney_reward/std": 1.2641041278839111, + "rewards/length2tails_reward/mean": 0.6873390674591064, + "rewards/length2tails_reward/std": 0.3392084836959839, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.51523494720459, + "rewards/thermo_reward/std": 2.2844624519348145, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.111768938601017, + "epoch": 0.932, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14316429197788239, + "learning_rate": 1.8617195367307949e-06, + "loss": -0.0057, + "num_tokens": 4045037.0, + "reward": 12.788331985473633, + "reward_std": 3.252232551574707, + "rewards/fitness_reward/mean": 6.863459587097168, + "rewards/fitness_reward/std": 2.2403881549835205, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7575904130935669, + "rewards/length2tails_reward/std": 0.31032228469848633, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.204712152481079, + "rewards/thermo_reward/std": 0.9641280770301819, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11796944122761488, + "epoch": 0.934, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09694632142782211, + "learning_rate": 1.8610682175768257e-06, + "loss": -0.0057, + "num_tokens": 4053751.0, + "reward": 12.549504280090332, + "reward_std": 2.293038845062256, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.499530553817749, + "rewards/kidney_reward/std": 0.5794037580490112, + "rewards/length2tails_reward/mean": 0.7065576314926147, + "rewards/length2tails_reward/std": 0.29977843165397644, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.5756421089172363, + "rewards/thermo_reward/std": 1.6526124477386475, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11116079892963171, + "epoch": 0.936, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09160441905260086, + "learning_rate": 1.8604154826125268e-06, + "loss": -0.004, + "num_tokens": 4062521.0, + "reward": 11.810023307800293, + "reward_std": 5.807432651519775, + "rewards/fitness_reward/mean": 6.940349102020264, + "rewards/fitness_reward/std": 2.380608320236206, + "rewards/kidney_reward/mean": 2.253354072570801, + "rewards/kidney_reward/std": 1.6274490356445312, + "rewards/length2tails_reward/mean": 0.7608587741851807, + "rewards/length2tails_reward/std": 0.31576141715049744, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4402339458465576, + "rewards/thermo_reward/std": 2.2192366123199463, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.9375, + "completions/mean_terminated_length": 273.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09761633444577456, + "epoch": 0.938, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05027596279978752, + "learning_rate": 1.8597613329111566e-06, + "loss": -0.0023, + "num_tokens": 4071319.0, + "reward": 11.971752166748047, + "reward_std": 4.021405220031738, + "rewards/fitness_reward/mean": 6.9682512283325195, + "rewards/fitness_reward/std": 1.9147640466690063, + "rewards/kidney_reward/mean": 2.3168246746063232, + "rewards/kidney_reward/std": 0.8652597069740295, + "rewards/length2tails_reward/mean": 0.7702042460441589, + "rewards/length2tails_reward/std": 0.32408878207206726, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.509655475616455, + "rewards/thermo_reward/std": 1.7250123023986816, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 266.15625, + "completions/mean_terminated_length": 266.15625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.10231568291783333, + "epoch": 0.94, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22313550114631653, + "learning_rate": 1.8591057695483e-06, + "loss": -0.0678, + "num_tokens": 4079868.0, + "reward": 9.78188705444336, + "reward_std": 8.069836616516113, + "rewards/fitness_reward/mean": 5.98593807220459, + "rewards/fitness_reward/std": 3.7097551822662354, + "rewards/kidney_reward/mean": 1.5625232458114624, + "rewards/kidney_reward/std": 2.1295108795166016, + "rewards/length2tails_reward/mean": 0.7279834747314453, + "rewards/length2tails_reward/std": 0.35069504380226135, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.0606276988983154, + "rewards/thermo_reward/std": 2.832650899887085, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.0625, + "completions/mean_terminated_length": 271.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09819444455206394, + "epoch": 0.942, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07939817011356354, + "learning_rate": 1.858448793601866e-06, + "loss": -0.0048, + "num_tokens": 4088574.0, + "reward": 12.200660705566406, + "reward_std": 3.8725593090057373, + "rewards/fitness_reward/mean": 6.886867046356201, + "rewards/fitness_reward/std": 2.0616986751556396, + "rewards/kidney_reward/mean": 2.4161300659179688, + "rewards/kidney_reward/std": 0.6241863369941711, + "rewards/length2tails_reward/mean": 0.6536673903465271, + "rewards/length2tails_reward/std": 0.3719845116138458, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.732297420501709, + "rewards/thermo_reward/std": 2.0418496131896973, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11007297784090042, + "epoch": 0.944, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06128634884953499, + "learning_rate": 1.8577904061520866e-06, + "loss": -0.0081, + "num_tokens": 4097278.0, + "reward": 12.243531227111816, + "reward_std": 3.5722525119781494, + "rewards/fitness_reward/mean": 6.99554443359375, + "rewards/fitness_reward/std": 1.7628074884414673, + "rewards/kidney_reward/mean": 2.3417837619781494, + "rewards/kidney_reward/std": 0.7228731513023376, + "rewards/length2tails_reward/mean": 0.6581995487213135, + "rewards/length2tails_reward/std": 0.339339941740036, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7403831481933594, + "rewards/thermo_reward/std": 1.9604158401489258, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.84375, + "completions/mean_terminated_length": 269.84375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09462772868573666, + "epoch": 0.946, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09483262896537781, + "learning_rate": 1.8571306082815148e-06, + "loss": -0.0077, + "num_tokens": 4105945.0, + "reward": 10.221860885620117, + "reward_std": 4.868222713470459, + "rewards/fitness_reward/mean": 6.629333972930908, + "rewards/fitness_reward/std": 2.439204692840576, + "rewards/kidney_reward/mean": 2.050110340118408, + "rewards/kidney_reward/std": 1.2794262170791626, + "rewards/length2tails_reward/mean": 0.554818868637085, + "rewards/length2tails_reward/std": 0.39995256066322327, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.3869357109069824, + "rewards/thermo_reward/std": 2.6503026485443115, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10915269795805216, + "epoch": 0.948, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1461164355278015, + "learning_rate": 1.8564694010750221e-06, + "loss": 0.0002, + "num_tokens": 4114631.0, + "reward": 10.527725219726562, + "reward_std": 6.852277755737305, + "rewards/fitness_reward/mean": 6.522241592407227, + "rewards/fitness_reward/std": 2.871361255645752, + "rewards/kidney_reward/mean": 1.8803939819335938, + "rewards/kidney_reward/std": 1.8421525955200195, + "rewards/length2tails_reward/mean": 0.5725621581077576, + "rewards/length2tails_reward/std": 0.3809102475643158, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.967833161354065, + "rewards/thermo_reward/std": 2.5932414531707764, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10617079585790634, + "epoch": 0.95, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16131462156772614, + "learning_rate": 1.8558067856197975e-06, + "loss": -0.0002, + "num_tokens": 4123335.0, + "reward": 11.356301307678223, + "reward_std": 4.680373668670654, + "rewards/fitness_reward/mean": 7.0524444580078125, + "rewards/fitness_reward/std": 1.746500015258789, + "rewards/kidney_reward/mean": 2.194664478302002, + "rewards/kidney_reward/std": 1.2206449508666992, + "rewards/length2tails_reward/mean": 0.6265060901641846, + "rewards/length2tails_reward/std": 0.3382713198661804, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9465417861938477, + "rewards/thermo_reward/std": 2.584482431411743, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10205241851508617, + "epoch": 0.952, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1406603455543518, + "learning_rate": 1.8551427630053463e-06, + "loss": -0.0054, + "num_tokens": 4132007.0, + "reward": 12.318782806396484, + "reward_std": 2.5967447757720947, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3722710609436035, + "rewards/kidney_reward/std": 0.8007427453994751, + "rewards/length2tails_reward/mean": 0.6015236973762512, + "rewards/length2tails_reward/std": 0.35340017080307007, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4826831817626953, + "rewards/thermo_reward/std": 1.8363468647003174, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10897734574973583, + "epoch": 0.954, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07693850249052048, + "learning_rate": 1.8544773343234858e-06, + "loss": -0.0035, + "num_tokens": 4140708.0, + "reward": 11.850772857666016, + "reward_std": 4.4592084884643555, + "rewards/fitness_reward/mean": 6.938035011291504, + "rewards/fitness_reward/std": 1.840762972831726, + "rewards/kidney_reward/mean": 2.3154757022857666, + "rewards/kidney_reward/std": 1.0696473121643066, + "rewards/length2tails_reward/mean": 0.626863956451416, + "rewards/length2tails_reward/std": 0.38042908906936646, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4345757961273193, + "rewards/thermo_reward/std": 2.333374261856079, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10131760127842426, + "epoch": 0.956, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07668693363666534, + "learning_rate": 1.853810500668347e-06, + "loss": -0.0044, + "num_tokens": 4149394.0, + "reward": 12.450238227844238, + "reward_std": 2.4274275302886963, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.423727035522461, + "rewards/kidney_reward/std": 0.5866073369979858, + "rewards/length2tails_reward/mean": 0.6329134106636047, + "rewards/length2tails_reward/std": 0.32143205404281616, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.502035617828369, + "rewards/thermo_reward/std": 2.041130542755127, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10201771184802055, + "epoch": 0.958, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08321689069271088, + "learning_rate": 1.8531422631363704e-06, + "loss": -0.003, + "num_tokens": 4158110.0, + "reward": 12.717397689819336, + "reward_std": 2.4608662128448486, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4727680683135986, + "rewards/kidney_reward/std": 0.7264254689216614, + "rewards/length2tails_reward/mean": 0.6925871968269348, + "rewards/length2tails_reward/std": 0.31042250990867615, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7141854763031006, + "rewards/thermo_reward/std": 1.9494096040725708, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.625, + "completions/mean_terminated_length": 270.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1034108754247427, + "epoch": 0.96, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05778651311993599, + "learning_rate": 1.8524726228263044e-06, + "loss": -0.0086, + "num_tokens": 4166802.0, + "reward": 11.463024139404297, + "reward_std": 4.595519065856934, + "rewards/fitness_reward/mean": 6.678974151611328, + "rewards/fitness_reward/std": 2.4648430347442627, + "rewards/kidney_reward/mean": 2.2850759029388428, + "rewards/kidney_reward/std": 1.023033618927002, + "rewards/length2tails_reward/mean": 0.5975102782249451, + "rewards/length2tails_reward/std": 0.3732609152793884, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.339221954345703, + "rewards/thermo_reward/std": 2.1146011352539062, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.59375, + "completions/mean_terminated_length": 269.59375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "entropy": 0.11340959277004004, + "epoch": 0.962, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6065387725830078, + "learning_rate": 1.8518015808392043e-06, + "loss": -0.0232, + "num_tokens": 4175461.0, + "reward": 10.408863067626953, + "reward_std": 7.76609992980957, + "rewards/fitness_reward/mean": 6.237054824829102, + "rewards/fitness_reward/std": 3.5606303215026855, + "rewards/kidney_reward/mean": 1.810880184173584, + "rewards/kidney_reward/std": 1.9630531072616577, + "rewards/length2tails_reward/mean": 0.7209354639053345, + "rewards/length2tails_reward/std": 0.31308096647262573, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.1888349056243896, + "rewards/thermo_reward/std": 2.5315160751342773, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.10557975806295872, + "epoch": 0.964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5803879499435425, + "learning_rate": 1.8511291382784297e-06, + "loss": -0.0099, + "num_tokens": 4184198.0, + "reward": 11.030176162719727, + "reward_std": 6.811028003692627, + "rewards/fitness_reward/mean": 6.34883975982666, + "rewards/fitness_reward/std": 3.2230067253112793, + "rewards/kidney_reward/mean": 2.0069994926452637, + "rewards/kidney_reward/std": 1.6315367221832275, + "rewards/length2tails_reward/mean": 0.7537314891815186, + "rewards/length2tails_reward/std": 0.3153182864189148, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4989633560180664, + "rewards/thermo_reward/std": 2.3130509853363037, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.71875, + "completions/mean_terminated_length": 269.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09722129534929991, + "epoch": 0.966, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09880959242582321, + "learning_rate": 1.850455296249644e-06, + "loss": -0.0078, + "num_tokens": 4192861.0, + "reward": 10.829471588134766, + "reward_std": 4.066839218139648, + "rewards/fitness_reward/mean": 6.840295791625977, + "rewards/fitness_reward/std": 2.014033079147339, + "rewards/kidney_reward/mean": 2.231708526611328, + "rewards/kidney_reward/std": 0.9333588480949402, + "rewards/length2tails_reward/mean": 0.514818012714386, + "rewards/length2tails_reward/std": 0.397733211517334, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.6059857606887817, + "rewards/thermo_reward/std": 2.648850202560425, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10204268898814917, + "epoch": 0.968, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09644491225481033, + "learning_rate": 1.849780055860811e-06, + "loss": -0.0005, + "num_tokens": 4201606.0, + "reward": 12.977973937988281, + "reward_std": 1.523888349533081, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.517043113708496, + "rewards/kidney_reward/std": 0.2941751182079315, + "rewards/length2tails_reward/mean": 0.7408137917518616, + "rewards/length2tails_reward/std": 0.29664987325668335, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9256644248962402, + "rewards/thermo_reward/std": 1.3327876329421997, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.6875, + "completions/mean_terminated_length": 269.6875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10017648991197348, + "epoch": 0.97, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12505213916301727, + "learning_rate": 1.8491034182221936e-06, + "loss": -0.0048, + "num_tokens": 4210268.0, + "reward": 11.93092155456543, + "reward_std": 3.9217958450317383, + "rewards/fitness_reward/mean": 6.938035011291504, + "rewards/fitness_reward/std": 1.78042471408844, + "rewards/kidney_reward/mean": 2.246797800064087, + "rewards/kidney_reward/std": 0.8911625742912292, + "rewards/length2tails_reward/mean": 0.5875948667526245, + "rewards/length2tails_reward/std": 0.3603399097919464, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.5873284339904785, + "rewards/thermo_reward/std": 1.9247843027114868, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.0971584739163518, + "epoch": 0.972, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11438634991645813, + "learning_rate": 1.8484253844463524e-06, + "loss": -0.0076, + "num_tokens": 4218955.0, + "reward": 12.453248977661133, + "reward_std": 2.544297218322754, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3667173385620117, + "rewards/kidney_reward/std": 0.725242018699646, + "rewards/length2tails_reward/mean": 0.5997471213340759, + "rewards/length2tails_reward/std": 0.3877236247062683, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.622880458831787, + "rewards/thermo_reward/std": 1.9670342206954956, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10207152832299471, + "epoch": 0.974, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08598785102367401, + "learning_rate": 1.8477459556481437e-06, + "loss": -0.004, + "num_tokens": 4227716.0, + "reward": 11.039971351623535, + "reward_std": 5.43131160736084, + "rewards/fitness_reward/mean": 6.3909101486206055, + "rewards/fitness_reward/std": 3.0708882808685303, + "rewards/kidney_reward/mean": 2.193580150604248, + "rewards/kidney_reward/std": 1.1655123233795166, + "rewards/length2tails_reward/mean": 0.7655168771743774, + "rewards/length2tails_reward/std": 0.3056708872318268, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.2789292335510254, + "rewards/thermo_reward/std": 2.220484495162964, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10653836652636528, + "epoch": 0.976, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07108797878026962, + "learning_rate": 1.8470651329447175e-06, + "loss": -0.006, + "num_tokens": 4236426.0, + "reward": 12.28726577758789, + "reward_std": 2.68487548828125, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.476879596710205, + "rewards/kidney_reward/std": 0.5689665675163269, + "rewards/length2tails_reward/mean": 0.6496505737304688, + "rewards/length2tails_reward/std": 0.39126691222190857, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.284236431121826, + "rewards/thermo_reward/std": 2.293436288833618, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09677772503346205, + "epoch": 0.978, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05292100831866264, + "learning_rate": 1.8463829174555157e-06, + "loss": -0.0025, + "num_tokens": 4245114.0, + "reward": 11.512476921081543, + "reward_std": 6.075210094451904, + "rewards/fitness_reward/mean": 6.321190357208252, + "rewards/fitness_reward/std": 3.2969279289245605, + "rewards/kidney_reward/mean": 2.188544750213623, + "rewards/kidney_reward/std": 1.2047877311706543, + "rewards/length2tails_reward/mean": 0.6134219169616699, + "rewards/length2tails_reward/std": 0.3425354063510895, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8413994312286377, + "rewards/thermo_reward/std": 1.940303087234497, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.875, + "completions/mean_terminated_length": 270.875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10276855435222387, + "epoch": 0.98, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08010396361351013, + "learning_rate": 1.8456993103022703e-06, + "loss": -0.0096, + "num_tokens": 4253814.0, + "reward": 10.646703720092773, + "reward_std": 5.514558792114258, + "rewards/fitness_reward/mean": 6.77551794052124, + "rewards/fitness_reward/std": 2.1163885593414307, + "rewards/kidney_reward/mean": 1.8354774713516235, + "rewards/kidney_reward/std": 1.6018898487091064, + "rewards/length2tails_reward/mean": 0.6580426692962646, + "rewards/length2tails_reward/std": 0.3390595018863678, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.8699034452438354, + "rewards/thermo_reward/std": 2.7576637268066406, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10569905303418636, + "epoch": 0.982, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06571035832166672, + "learning_rate": 1.8450143126090012e-06, + "loss": -0.005, + "num_tokens": 4262553.0, + "reward": 13.003958702087402, + "reward_std": 2.756281852722168, + "rewards/fitness_reward/mean": 6.987574577331543, + "rewards/fitness_reward/std": 2.1134605407714844, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7179726362228394, + "rewards/length2tails_reward/std": 0.3528580963611603, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2728257179260254, + "rewards/thermo_reward/std": 1.1995137929916382, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10283433459699154, + "epoch": 0.984, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08230716735124588, + "learning_rate": 1.844327925502015e-06, + "loss": -0.0026, + "num_tokens": 4271289.0, + "reward": 10.768950462341309, + "reward_std": 6.651513576507568, + "rewards/fitness_reward/mean": 6.357293128967285, + "rewards/fitness_reward/std": 3.1766951084136963, + "rewards/kidney_reward/mean": 2.013303756713867, + "rewards/kidney_reward/std": 1.531698226928711, + "rewards/length2tails_reward/mean": 0.6933887004852295, + "rewards/length2tails_reward/std": 0.32781982421875, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.2290148735046387, + "rewards/thermo_reward/std": 2.4779889583587646, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10609956178814173, + "epoch": 0.986, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12279146909713745, + "learning_rate": 1.8436401501099033e-06, + "loss": 0.0054, + "num_tokens": 4280022.0, + "reward": 11.089923858642578, + "reward_std": 5.652596473693848, + "rewards/fitness_reward/mean": 6.703958034515381, + "rewards/fitness_reward/std": 2.588440179824829, + "rewards/kidney_reward/mean": 2.064419746398926, + "rewards/kidney_reward/std": 1.4529000520706177, + "rewards/length2tails_reward/mean": 0.7086784839630127, + "rewards/length2tails_reward/std": 0.3152945637702942, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.1506786346435547, + "rewards/thermo_reward/std": 2.2029199600219727, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11765718180686235, + "epoch": 0.988, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08861474692821503, + "learning_rate": 1.8429509875635394e-06, + "loss": 0.0, + "num_tokens": 4288772.0, + "reward": 12.592763900756836, + "reward_std": 2.527524471282959, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.450157403945923, + "rewards/kidney_reward/std": 0.5792059302330017, + "rewards/length2tails_reward/mean": 0.7417012453079224, + "rewards/length2tails_reward/std": 0.3148317039012909, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.664761781692505, + "rewards/thermo_reward/std": 1.8582063913345337, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.65625, + "completions/mean_terminated_length": 271.65625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10666430741548538, + "epoch": 0.99, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06550094485282898, + "learning_rate": 1.8422604389960781e-06, + "loss": -0.0037, + "num_tokens": 4297497.0, + "reward": 13.099773406982422, + "reward_std": 1.8023239374160767, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7044112682342529, + "rewards/length2tails_reward/std": 0.34925153851509094, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.996385097503662, + "rewards/thermo_reward/std": 1.6214781999588013, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.96875, + "completions/mean_terminated_length": 272.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11526863928884268, + "epoch": 0.992, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06436719745397568, + "learning_rate": 1.841568505542953e-06, + "loss": 0.0007, + "num_tokens": 4306264.0, + "reward": 12.25389575958252, + "reward_std": 4.86123514175415, + "rewards/fitness_reward/mean": 6.996882438659668, + "rewards/fitness_reward/std": 2.060805082321167, + "rewards/kidney_reward/mean": 2.236731767654419, + "rewards/kidney_reward/std": 1.2397674322128296, + "rewards/length2tails_reward/mean": 0.796875, + "rewards/length2tails_reward/std": 0.2885507345199585, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8405933380126953, + "rewards/thermo_reward/std": 2.0179951190948486, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11093102768063545, + "epoch": 0.994, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10350531339645386, + "learning_rate": 1.8408751883418752e-06, + "loss": -0.0026, + "num_tokens": 4314972.0, + "reward": 11.615776062011719, + "reward_std": 5.574860572814941, + "rewards/fitness_reward/mean": 6.668831825256348, + "rewards/fitness_reward/std": 2.5053114891052246, + "rewards/kidney_reward/mean": 2.082124710083008, + "rewards/kidney_reward/std": 1.3706881999969482, + "rewards/length2tails_reward/mean": 0.645334005355835, + "rewards/length2tails_reward/std": 0.36578190326690674, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.700286865234375, + "rewards/thermo_reward/std": 2.2157039642333984, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1063738688826561, + "epoch": 0.996, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07675768435001373, + "learning_rate": 1.84018048853283e-06, + "loss": -0.003, + "num_tokens": 4323707.0, + "reward": 12.359034538269043, + "reward_std": 3.6783368587493896, + "rewards/fitness_reward/mean": 6.99554443359375, + "rewards/fitness_reward/std": 1.7628074884414673, + "rewards/kidney_reward/mean": 2.3726367950439453, + "rewards/kidney_reward/std": 0.804735004901886, + "rewards/length2tails_reward/mean": 0.6753664612770081, + "rewards/length2tails_reward/std": 0.3424879014492035, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.823317050933838, + "rewards/thermo_reward/std": 1.7088568210601807, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09414213243871927, + "epoch": 0.998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08835254609584808, + "learning_rate": 1.8394844072580772e-06, + "loss": -0.0066, + "num_tokens": 4332410.0, + "reward": 12.498444557189941, + "reward_std": 2.584066152572632, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.452592670917511, + "rewards/kidney_reward/mean": 2.3080384731292725, + "rewards/kidney_reward/std": 1.0372236967086792, + "rewards/length2tails_reward/mean": 0.6141963601112366, + "rewards/length2tails_reward/std": 0.36699378490448, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.782820463180542, + "rewards/thermo_reward/std": 1.6219178438186646, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11134423781186342, + "epoch": 1.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.054818034172058105, + "learning_rate": 1.8387869456621473e-06, + "loss": -0.0017, + "num_tokens": 4341123.0, + "reward": 11.53868293762207, + "reward_std": 5.201396465301514, + "rewards/fitness_reward/mean": 6.6967315673828125, + "rewards/fitness_reward/std": 2.6219189167022705, + "rewards/kidney_reward/mean": 2.3333699703216553, + "rewards/kidney_reward/std": 1.1131701469421387, + "rewards/length2tails_reward/mean": 0.70151686668396, + "rewards/length2tails_reward/std": 0.3320710062980652, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.3384299278259277, + "rewards/thermo_reward/std": 2.0443801879882812, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.1253562057390809, + "epoch": 1.002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06054362282156944, + "learning_rate": 1.8380881048918404e-06, + "loss": 0.002, + "num_tokens": 4349874.0, + "reward": 12.315496444702148, + "reward_std": 4.542181968688965, + "rewards/fitness_reward/mean": 7.001006126403809, + "rewards/fitness_reward/std": 2.0374791622161865, + "rewards/kidney_reward/mean": 2.3622498512268066, + "rewards/kidney_reward/std": 1.04790198802948, + "rewards/length2tails_reward/mean": 0.7722654342651367, + "rewards/length2tails_reward/std": 0.2462671995162964, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.775014877319336, + "rewards/thermo_reward/std": 1.8799045085906982, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11229314375668764, + "epoch": 1.004, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24925032258033752, + "learning_rate": 1.8373878860962253e-06, + "loss": 0.0003, + "num_tokens": 4358586.0, + "reward": 11.532876968383789, + "reward_std": 5.967067241668701, + "rewards/fitness_reward/mean": 6.932981967926025, + "rewards/fitness_reward/std": 2.4222817420959473, + "rewards/kidney_reward/mean": 2.1588492393493652, + "rewards/kidney_reward/std": 1.5926848649978638, + "rewards/length2tails_reward/mean": 0.6957947015762329, + "rewards/length2tails_reward/std": 0.29977282881736755, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.271465301513672, + "rewards/thermo_reward/std": 2.5201990604400635, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0931809302419424, + "epoch": 1.006, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06151973456144333, + "learning_rate": 1.8366862904266354e-06, + "loss": -0.0089, + "num_tokens": 4367323.0, + "reward": 11.453720092773438, + "reward_std": 4.2606096267700195, + "rewards/fitness_reward/mean": 6.628477096557617, + "rewards/fitness_reward/std": 2.486849546432495, + "rewards/kidney_reward/mean": 2.3373258113861084, + "rewards/kidney_reward/std": 0.7392154932022095, + "rewards/length2tails_reward/mean": 0.6723840236663818, + "rewards/length2tails_reward/std": 0.3951359689235687, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.3206787109375, + "rewards/thermo_reward/std": 1.9403377771377563, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10857211658731103, + "epoch": 1.008, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08904221653938293, + "learning_rate": 1.8359833190366683e-06, + "loss": -0.0083, + "num_tokens": 4376056.0, + "reward": 11.623477935791016, + "reward_std": 4.135257244110107, + "rewards/fitness_reward/mean": 6.969882488250732, + "rewards/fitness_reward/std": 1.9056702852249146, + "rewards/kidney_reward/mean": 2.1147756576538086, + "rewards/kidney_reward/std": 1.0822776556015015, + "rewards/length2tails_reward/mean": 0.7080065011978149, + "rewards/length2tails_reward/std": 0.3449402153491974, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.368018627166748, + "rewards/thermo_reward/std": 2.2309768199920654, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1062892684713006, + "epoch": 1.01, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.058957964181900024, + "learning_rate": 1.835278973082184e-06, + "loss": -0.0043, + "num_tokens": 4384788.0, + "reward": 11.644981384277344, + "reward_std": 5.438553333282471, + "rewards/fitness_reward/mean": 6.697482585906982, + "rewards/fitness_reward/std": 2.618746280670166, + "rewards/kidney_reward/mean": 2.15616774559021, + "rewards/kidney_reward/std": 1.2606943845748901, + "rewards/length2tails_reward/mean": 0.7042187452316284, + "rewards/length2tails_reward/std": 0.3460327088832855, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6209092140197754, + "rewards/thermo_reward/std": 2.147388458251953, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.1875, + "completions/mean_terminated_length": 270.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09534286428242922, + "epoch": 1.012, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07244093716144562, + "learning_rate": 1.8345732537213026e-06, + "loss": -0.0038, + "num_tokens": 4393466.0, + "reward": 10.906755447387695, + "reward_std": 5.130542278289795, + "rewards/fitness_reward/mean": 6.565263748168945, + "rewards/fitness_reward/std": 2.7279858589172363, + "rewards/kidney_reward/mean": 2.2643046379089355, + "rewards/kidney_reward/std": 1.112157940864563, + "rewards/length2tails_reward/mean": 0.5872292518615723, + "rewards/length2tails_reward/std": 0.38594943284988403, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9184648990631104, + "rewards/thermo_reward/std": 2.4295599460601807, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11011694930493832, + "epoch": 1.014, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10459110885858536, + "learning_rate": 1.8338661621144022e-06, + "loss": -0.0045, + "num_tokens": 4402193.0, + "reward": 11.295709609985352, + "reward_std": 6.177306652069092, + "rewards/fitness_reward/mean": 6.025014877319336, + "rewards/fitness_reward/std": 3.602267026901245, + "rewards/kidney_reward/mean": 2.210982084274292, + "rewards/kidney_reward/std": 1.167697787284851, + "rewards/length2tails_reward/mean": 0.696526288986206, + "rewards/length2tails_reward/std": 0.3571210205554962, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.890059471130371, + "rewards/thermo_reward/std": 1.7832286357879639, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.09375, + "completions/mean_terminated_length": 271.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09178171120584011, + "epoch": 1.016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0839139074087143, + "learning_rate": 1.833157699424117e-06, + "loss": -0.0069, + "num_tokens": 4410900.0, + "reward": 12.223529815673828, + "reward_std": 3.18105149269104, + "rewards/fitness_reward/mean": 7.0122480392456055, + "rewards/fitness_reward/std": 1.973885416984558, + "rewards/kidney_reward/mean": 2.4003708362579346, + "rewards/kidney_reward/std": 0.5782254934310913, + "rewards/length2tails_reward/mean": 0.65845787525177, + "rewards/length2tails_reward/std": 0.329045832157135, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.645064353942871, + "rewards/thermo_reward/std": 1.9347671270370483, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.10174894332885742, + "epoch": 1.018, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10819156467914581, + "learning_rate": 1.8324478668153366e-06, + "loss": -0.0057, + "num_tokens": 4419616.0, + "reward": 11.47518539428711, + "reward_std": 5.487979888916016, + "rewards/fitness_reward/mean": 6.920076370239258, + "rewards/fitness_reward/std": 2.183814287185669, + "rewards/kidney_reward/mean": 2.144624948501587, + "rewards/kidney_reward/std": 1.366068959236145, + "rewards/length2tails_reward/mean": 0.6623252630233765, + "rewards/length2tails_reward/std": 0.3512051999568939, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.244251012802124, + "rewards/thermo_reward/std": 2.4937503337860107, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 267.28125, + "completions/mean_terminated_length": 267.28125, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.09971592016518116, + "epoch": 1.02, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2520746886730194, + "learning_rate": 1.8317366654552013e-06, + "loss": -0.0522, + "num_tokens": 4428201.0, + "reward": 10.527538299560547, + "reward_std": 6.803934574127197, + "rewards/fitness_reward/mean": 6.230249881744385, + "rewards/fitness_reward/std": 3.576293468475342, + "rewards/kidney_reward/mean": 2.0987014770507812, + "rewards/kidney_reward/std": 1.649312973022461, + "rewards/length2tails_reward/mean": 0.6444635391235352, + "rewards/length2tails_reward/std": 0.3857713043689728, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.0341413021087646, + "rewards/thermo_reward/std": 2.3133511543273926, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1145568871870637, + "epoch": 1.022, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04647472873330116, + "learning_rate": 1.8310240965131038e-06, + "loss": -0.0035, + "num_tokens": 4436922.0, + "reward": 11.225116729736328, + "reward_std": 5.405675411224365, + "rewards/fitness_reward/mean": 6.641835689544678, + "rewards/fitness_reward/std": 2.6161036491394043, + "rewards/kidney_reward/mean": 2.1738617420196533, + "rewards/kidney_reward/std": 1.2172448635101318, + "rewards/length2tails_reward/mean": 0.7327454686164856, + "rewards/length2tails_reward/std": 0.29265257716178894, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.2361443042755127, + "rewards/thermo_reward/std": 2.1350820064544678, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10694421455264091, + "epoch": 1.024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09988442808389664, + "learning_rate": 1.8303101611606847e-06, + "loss": -0.0026, + "num_tokens": 4445618.0, + "reward": 11.589323043823242, + "reward_std": 3.4726717472076416, + "rewards/fitness_reward/mean": 6.948733806610107, + "rewards/fitness_reward/std": 2.023653745651245, + "rewards/kidney_reward/mean": 2.2359910011291504, + "rewards/kidney_reward/std": 0.8204216957092285, + "rewards/length2tails_reward/mean": 0.6413706541061401, + "rewards/length2tails_reward/std": 0.3659186363220215, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.2404606342315674, + "rewards/thermo_reward/std": 2.057912826538086, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "entropy": 0.0958978058770299, + "epoch": 1.026, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1153431311249733, + "learning_rate": 1.8295948605718311e-06, + "loss": -0.0033, + "num_tokens": 4454282.0, + "reward": 11.745344161987305, + "reward_std": 5.108170986175537, + "rewards/fitness_reward/mean": 6.678919792175293, + "rewards/fitness_reward/std": 2.696547746658325, + "rewards/kidney_reward/mean": 2.1953866481781006, + "rewards/kidney_reward/std": 1.3030292987823486, + "rewards/length2tails_reward/mean": 0.7217558026313782, + "rewards/length2tails_reward/std": 0.306129515171051, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.698862075805664, + "rewards/thermo_reward/std": 2.0775516033172607, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10792504902929068, + "epoch": 1.028, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1147734671831131, + "learning_rate": 1.828878195922675e-06, + "loss": -0.0057, + "num_tokens": 4462976.0, + "reward": 11.455698013305664, + "reward_std": 3.909986734390259, + "rewards/fitness_reward/mean": 6.936039924621582, + "rewards/fitness_reward/std": 1.791343092918396, + "rewards/kidney_reward/mean": 2.275510549545288, + "rewards/kidney_reward/std": 1.045230746269226, + "rewards/length2tails_reward/mean": 0.6162968277931213, + "rewards/length2tails_reward/std": 0.3768027722835541, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.0825178623199463, + "rewards/thermo_reward/std": 2.351339340209961, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09783510770648718, + "epoch": 1.03, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08594634383916855, + "learning_rate": 1.8281601683915914e-06, + "loss": -0.0072, + "num_tokens": 4471692.0, + "reward": 11.942680358886719, + "reward_std": 4.303277015686035, + "rewards/fitness_reward/mean": 7.026032447814941, + "rewards/fitness_reward/std": 1.8959089517593384, + "rewards/kidney_reward/mean": 2.2736551761627197, + "rewards/kidney_reward/std": 0.9514713883399963, + "rewards/length2tails_reward/mean": 0.6495813131332397, + "rewards/length2tails_reward/std": 0.35913750529289246, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.478034496307373, + "rewards/thermo_reward/std": 2.3042123317718506, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10969353280961514, + "epoch": 1.032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06955910474061966, + "learning_rate": 1.8274407791591963e-06, + "loss": -0.0085, + "num_tokens": 4480407.0, + "reward": 12.465177536010742, + "reward_std": 3.026686429977417, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.452592670917511, + "rewards/kidney_reward/mean": 2.355340003967285, + "rewards/kidney_reward/std": 0.8772143721580505, + "rewards/length2tails_reward/mean": 0.7324072122573853, + "rewards/length2tails_reward/std": 0.2933201789855957, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.690430164337158, + "rewards/thermo_reward/std": 1.988502025604248, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10960424691438675, + "epoch": 1.034, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09094872325658798, + "learning_rate": 1.8267200294083446e-06, + "loss": -0.0049, + "num_tokens": 4489171.0, + "reward": 12.051908493041992, + "reward_std": 3.2191474437713623, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.3650550842285156, + "rewards/kidney_reward/std": 0.7524277567863464, + "rewards/length2tails_reward/mean": 0.7414542436599731, + "rewards/length2tails_reward/std": 0.29311615228652954, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.2665421962738037, + "rewards/thermo_reward/std": 2.147674322128296, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09909907821565866, + "epoch": 1.036, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07928860932588577, + "learning_rate": 1.8259979203241278e-06, + "loss": -0.0026, + "num_tokens": 4497898.0, + "reward": 12.741703033447266, + "reward_std": 2.7505831718444824, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.452592670917511, + "rewards/kidney_reward/mean": 2.4167726039886475, + "rewards/kidney_reward/std": 0.7528964281082153, + "rewards/length2tails_reward/mean": 0.6687544584274292, + "rewards/length2tails_reward/std": 0.3758578896522522, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9118881225585938, + "rewards/thermo_reward/std": 1.768688440322876, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.71875, + "completions/mean_terminated_length": 269.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08322983188554645, + "epoch": 1.038, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14451418817043304, + "learning_rate": 1.8252744530938734e-06, + "loss": -0.0065, + "num_tokens": 4506561.0, + "reward": 11.721282958984375, + "reward_std": 4.508687973022461, + "rewards/fitness_reward/mean": 6.937592506408691, + "rewards/fitness_reward/std": 1.7828460931777954, + "rewards/kidney_reward/mean": 2.0562925338745117, + "rewards/kidney_reward/std": 1.4569692611694336, + "rewards/length2tails_reward/mean": 0.5203403234481812, + "rewards/length2tails_reward/std": 0.38675656914711, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.575364112854004, + "rewards/thermo_reward/std": 2.1809372901916504, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10837945714592934, + "epoch": 1.04, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07109640538692474, + "learning_rate": 1.8245496289071406e-06, + "loss": -0.0024, + "num_tokens": 4515254.0, + "reward": 11.002652168273926, + "reward_std": 6.390777111053467, + "rewards/fitness_reward/mean": 6.532375812530518, + "rewards/fitness_reward/std": 2.853579044342041, + "rewards/kidney_reward/mean": 2.061156988143921, + "rewards/kidney_reward/std": 1.530630111694336, + "rewards/length2tails_reward/mean": 0.6790802478790283, + "rewards/length2tails_reward/std": 0.2736893594264984, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.2412116527557373, + "rewards/thermo_reward/std": 2.4915144443511963, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11088752839714289, + "epoch": 1.042, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12135175615549088, + "learning_rate": 1.8238234489557214e-06, + "loss": -0.0017, + "num_tokens": 4524001.0, + "reward": 12.2737398147583, + "reward_std": 3.4874472618103027, + "rewards/fitness_reward/mean": 7.131148338317871, + "rewards/fitness_reward/std": 0.905185341835022, + "rewards/kidney_reward/mean": 2.2964911460876465, + "rewards/kidney_reward/std": 1.0294334888458252, + "rewards/length2tails_reward/mean": 0.7129369974136353, + "rewards/length2tails_reward/std": 0.3501805067062378, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.674807071685791, + "rewards/thermo_reward/std": 2.00650691986084, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10240734275430441, + "epoch": 1.044, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08196910470724106, + "learning_rate": 1.8230959144336361e-06, + "loss": -0.0046, + "num_tokens": 4532697.0, + "reward": 11.078343391418457, + "reward_std": 3.847487688064575, + "rewards/fitness_reward/mean": 7.131148338317871, + "rewards/fitness_reward/std": 0.6183593273162842, + "rewards/kidney_reward/mean": 1.8429230451583862, + "rewards/kidney_reward/std": 1.2928483486175537, + "rewards/length2tails_reward/mean": 0.5630030632019043, + "rewards/length2tails_reward/std": 0.3994128406047821, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.947972059249878, + "rewards/thermo_reward/std": 2.4602742195129395, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.09375, + "completions/mean_terminated_length": 271.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10700604878365993, + "epoch": 1.046, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5880448818206787, + "learning_rate": 1.8223670265371328e-06, + "loss": -0.0068, + "num_tokens": 4541404.0, + "reward": 11.866726875305176, + "reward_std": 5.31985330581665, + "rewards/fitness_reward/mean": 6.681757926940918, + "rewards/fitness_reward/std": 2.685328483581543, + "rewards/kidney_reward/mean": 2.2746706008911133, + "rewards/kidney_reward/std": 1.2622480392456055, + "rewards/length2tails_reward/mean": 0.6555532217025757, + "rewards/length2tails_reward/std": 0.3372742831707001, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7447428703308105, + "rewards/thermo_reward/std": 2.023343563079834, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.3125, + "completions/mean_terminated_length": 273.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1021508201956749, + "epoch": 1.048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07593434303998947, + "learning_rate": 1.8216367864646836e-06, + "loss": -0.0024, + "num_tokens": 4550182.0, + "reward": 10.609359741210938, + "reward_std": 6.6359124183654785, + "rewards/fitness_reward/mean": 6.591131210327148, + "rewards/fitness_reward/std": 2.8076703548431396, + "rewards/kidney_reward/mean": 1.8723961114883423, + "rewards/kidney_reward/std": 1.6987696886062622, + "rewards/length2tails_reward/mean": 0.7949404716491699, + "rewards/length2tails_reward/std": 0.2707257866859436, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9663383960723877, + "rewards/thermo_reward/std": 2.863776206970215, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09915499482303858, + "epoch": 1.05, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14689137041568756, + "learning_rate": 1.8209051954169853e-06, + "loss": 0.0008, + "num_tokens": 4558909.0, + "reward": 11.72974681854248, + "reward_std": 5.813907146453857, + "rewards/fitness_reward/mean": 6.727794170379639, + "rewards/fitness_reward/std": 2.492391586303711, + "rewards/kidney_reward/mean": 2.1069676876068115, + "rewards/kidney_reward/std": 1.3520479202270508, + "rewards/length2tails_reward/mean": 0.6884158253669739, + "rewards/length2tails_reward/std": 0.3286331295967102, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7261428833007812, + "rewards/thermo_reward/std": 2.316819906234741, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.34375, + "completions/mean_terminated_length": 269.34375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.10628684982657433, + "epoch": 1.052, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06357128173112869, + "learning_rate": 1.8201722545969557e-06, + "loss": -0.0012, + "num_tokens": 4567560.0, + "reward": 12.3041353225708, + "reward_std": 3.4844396114349365, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.4296531677246094, + "rewards/kidney_reward/std": 0.5578335523605347, + "rewards/length2tails_reward/mean": 0.6693383455276489, + "rewards/length2tails_reward/std": 0.30575159192085266, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.654494285583496, + "rewards/thermo_reward/std": 1.891273856163025, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10698442067950964, + "epoch": 1.054, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1364608258008957, + "learning_rate": 1.8194379652097318e-06, + "loss": 0.0019, + "num_tokens": 4576272.0, + "reward": 11.943489074707031, + "reward_std": 3.0079591274261475, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3153510093688965, + "rewards/kidney_reward/std": 0.8190693855285645, + "rewards/length2tails_reward/mean": 0.7093468308448792, + "rewards/length2tails_reward/std": 0.28690895438194275, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.1535279750823975, + "rewards/thermo_reward/std": 2.2502543926239014, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.11601835023611784, + "epoch": 1.056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23851068317890167, + "learning_rate": 1.8187023284626676e-06, + "loss": 0.0002, + "num_tokens": 4585036.0, + "reward": 13.2262544631958, + "reward_std": 1.4058492183685303, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4896838665008545, + "rewards/kidney_reward/std": 0.3229711949825287, + "rewards/length2tails_reward/mean": 0.7839996218681335, + "rewards/length2tails_reward/std": 0.2844403088092804, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1969854831695557, + "rewards/thermo_reward/std": 1.1832919120788574, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.96875, + "completions/mean_terminated_length": 272.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11103155184537172, + "epoch": 1.058, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16242334246635437, + "learning_rate": 1.8179653455653337e-06, + "loss": -0.0042, + "num_tokens": 4593803.0, + "reward": 10.842390060424805, + "reward_std": 6.011104583740234, + "rewards/fitness_reward/mean": 6.547800064086914, + "rewards/fitness_reward/std": 2.580008029937744, + "rewards/kidney_reward/mean": 1.922468662261963, + "rewards/kidney_reward/std": 1.644716501235962, + "rewards/length2tails_reward/mean": 0.7669105529785156, + "rewards/length2tails_reward/std": 0.3401075005531311, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.195430278778076, + "rewards/thermo_reward/std": 2.555251359939575, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10503857117146254, + "epoch": 1.06, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08899249881505966, + "learning_rate": 1.8172270177295124e-06, + "loss": -0.0073, + "num_tokens": 4602529.0, + "reward": 12.144147872924805, + "reward_std": 3.435709238052368, + "rewards/fitness_reward/mean": 7.050104141235352, + "rewards/fitness_reward/std": 1.7597377300262451, + "rewards/kidney_reward/mean": 2.4563241004943848, + "rewards/kidney_reward/std": 0.5482203960418701, + "rewards/length2tails_reward/mean": 0.691953182220459, + "rewards/length2tails_reward/std": 0.35609135031700134, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4685235023498535, + "rewards/thermo_reward/std": 1.832617163658142, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.0973681602627039, + "epoch": 1.062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05948628485202789, + "learning_rate": 1.8164873461691986e-06, + "loss": -0.0036, + "num_tokens": 4611231.0, + "reward": 11.227486610412598, + "reward_std": 5.46626091003418, + "rewards/fitness_reward/mean": 6.693514823913574, + "rewards/fitness_reward/std": 2.635545253753662, + "rewards/kidney_reward/mean": 2.0863637924194336, + "rewards/kidney_reward/std": 1.2824519872665405, + "rewards/length2tails_reward/mean": 0.6062266826629639, + "rewards/length2tails_reward/std": 0.4095900058746338, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.286984920501709, + "rewards/thermo_reward/std": 2.25620174407959, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.53125, + "completions/mean_terminated_length": 272.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10592686384916306, + "epoch": 1.064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07250142097473145, + "learning_rate": 1.8157463321005966e-06, + "loss": -0.0022, + "num_tokens": 4619984.0, + "reward": 10.381654739379883, + "reward_std": 6.290080547332764, + "rewards/fitness_reward/mean": 6.5366058349609375, + "rewards/fitness_reward/std": 2.837371587753296, + "rewards/kidney_reward/mean": 1.9228678941726685, + "rewards/kidney_reward/std": 1.5469324588775635, + "rewards/length2tails_reward/mean": 0.7551746368408203, + "rewards/length2tails_reward/std": 0.28403139114379883, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.7466638088226318, + "rewards/thermo_reward/std": 2.5262794494628906, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11166021507233381, + "epoch": 1.066, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08945536613464355, + "learning_rate": 1.8150039767421178e-06, + "loss": -0.0124, + "num_tokens": 4628720.0, + "reward": 10.623258590698242, + "reward_std": 5.707748889923096, + "rewards/fitness_reward/mean": 6.571508407592773, + "rewards/fitness_reward/std": 2.4442975521087646, + "rewards/kidney_reward/mean": 1.9549903869628906, + "rewards/kidney_reward/std": 1.4934356212615967, + "rewards/length2tails_reward/mean": 0.6912268996238708, + "rewards/length2tails_reward/std": 0.355633020401001, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.927636742591858, + "rewards/thermo_reward/std": 2.6435155868530273, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.3125, + "completions/mean_terminated_length": 270.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10576423350721598, + "epoch": 1.068, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08647046983242035, + "learning_rate": 1.8142602813143784e-06, + "loss": -0.0035, + "num_tokens": 4637402.0, + "reward": 12.811609268188477, + "reward_std": 2.2232515811920166, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5006520748138428, + "rewards/kidney_reward/std": 0.5732922554016113, + "rewards/length2tails_reward/mean": 0.6195791363716125, + "rewards/length2tails_reward/std": 0.3544762432575226, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.787814140319824, + "rewards/thermo_reward/std": 1.824695348739624, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10622578114271164, + "epoch": 1.07, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1089043915271759, + "learning_rate": 1.8135152470401996e-06, + "loss": -0.0039, + "num_tokens": 4646086.0, + "reward": 9.206745147705078, + "reward_std": 8.957849502563477, + "rewards/fitness_reward/mean": 5.844204902648926, + "rewards/fitness_reward/std": 4.084497928619385, + "rewards/kidney_reward/mean": 1.6340656280517578, + "rewards/kidney_reward/std": 2.2367794513702393, + "rewards/length2tails_reward/mean": 0.6045545339584351, + "rewards/length2tails_reward/std": 0.37398043274879456, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.5680193901062012, + "rewards/thermo_reward/std": 3.0591702461242676, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.90625, + "completions/mean_terminated_length": 269.90625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.08916517347097397, + "epoch": 1.072, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08490046858787537, + "learning_rate": 1.8127688751446026e-06, + "loss": -0.0052, + "num_tokens": 4654755.0, + "reward": 11.566045761108398, + "reward_std": 4.726414680480957, + "rewards/fitness_reward/mean": 6.572394371032715, + "rewards/fitness_reward/std": 2.440897226333618, + "rewards/kidney_reward/mean": 2.2182819843292236, + "rewards/kidney_reward/std": 0.9536340832710266, + "rewards/length2tails_reward/mean": 0.5882750153541565, + "rewards/length2tails_reward/std": 0.3553884029388428, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 2.6227917671203613, + "rewards/thermo_reward/std": 1.9189839363098145, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11149153485894203, + "epoch": 1.074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09485744684934616, + "learning_rate": 1.8120211668548086e-06, + "loss": -0.0049, + "num_tokens": 4663485.0, + "reward": 12.444345474243164, + "reward_std": 2.8540360927581787, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3471357822418213, + "rewards/kidney_reward/std": 0.7977660298347473, + "rewards/length2tails_reward/mean": 0.7017210721969604, + "rewards/length2tails_reward/std": 0.3152361214160919, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.623361587524414, + "rewards/thermo_reward/std": 2.011770009994507, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1078566275537014, + "epoch": 1.076, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06216276437044144, + "learning_rate": 1.8112721234002357e-06, + "loss": -0.0054, + "num_tokens": 4672221.0, + "reward": 13.001523971557617, + "reward_std": 1.9588590860366821, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.517043113708496, + "rewards/kidney_reward/std": 0.2941751182079315, + "rewards/length2tails_reward/mean": 0.7192487716674805, + "rewards/length2tails_reward/std": 0.333200067281723, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.951371192932129, + "rewards/thermo_reward/std": 1.7540698051452637, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09430662263184786, + "epoch": 1.078, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5271546244621277, + "learning_rate": 1.810521746012498e-06, + "loss": 0.0065, + "num_tokens": 4680931.0, + "reward": 12.489099502563477, + "reward_std": 4.14736795425415, + "rewards/fitness_reward/mean": 7.024723052978516, + "rewards/fitness_reward/std": 1.9033170938491821, + "rewards/kidney_reward/mean": 2.435086250305176, + "rewards/kidney_reward/std": 0.9357979893684387, + "rewards/length2tails_reward/mean": 0.623183012008667, + "rewards/length2tails_reward/std": 0.3528957962989807, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8669726848602295, + "rewards/thermo_reward/std": 1.8115577697753906, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11281314957886934, + "epoch": 1.08, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09693945199251175, + "learning_rate": 1.8097700359254024e-06, + "loss": -0.0003, + "num_tokens": 4689651.0, + "reward": 12.46048355102539, + "reward_std": 2.768707752227783, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.3303580284118652, + "rewards/kidney_reward/std": 0.8740337491035461, + "rewards/length2tails_reward/mean": 0.6922803521156311, + "rewards/length2tails_reward/std": 0.3110790252685547, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.599713087081909, + "rewards/thermo_reward/std": 2.1388442516326904, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10293409042060375, + "epoch": 1.082, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06482190638780594, + "learning_rate": 1.8090169943749474e-06, + "loss": -0.0045, + "num_tokens": 4698382.0, + "reward": 12.603124618530273, + "reward_std": 3.6950109004974365, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.3166470527648926, + "rewards/kidney_reward/std": 0.8981736898422241, + "rewards/length2tails_reward/mean": 0.7382228374481201, + "rewards/length2tails_reward/std": 0.30910226702690125, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0596022605895996, + "rewards/thermo_reward/std": 1.7408092021942139, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10494351293891668, + "epoch": 1.084, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08620021492242813, + "learning_rate": 1.8082626225993205e-06, + "loss": 0.0015, + "num_tokens": 4707104.0, + "reward": 10.572208404541016, + "reward_std": 6.045322895050049, + "rewards/fitness_reward/mean": 6.605555534362793, + "rewards/fitness_reward/std": 2.77128529548645, + "rewards/kidney_reward/mean": 2.061997413635254, + "rewards/kidney_reward/std": 1.4656015634536743, + "rewards/length2tails_reward/mean": 0.6626001596450806, + "rewards/length2tails_reward/std": 0.3383328318595886, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.7383958101272583, + "rewards/thermo_reward/std": 2.649104118347168, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10786995757371187, + "epoch": 1.086, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0641787126660347, + "learning_rate": 1.8075069218388962e-06, + "loss": 0.0015, + "num_tokens": 4715808.0, + "reward": 12.383892059326172, + "reward_std": 4.372714042663574, + "rewards/fitness_reward/mean": 7.013314723968506, + "rewards/fitness_reward/std": 1.9678521156311035, + "rewards/kidney_reward/mean": 2.3276991844177246, + "rewards/kidney_reward/std": 0.9477407336235046, + "rewards/length2tails_reward/mean": 0.6996359825134277, + "rewards/length2tails_reward/std": 0.27757784724235535, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.872915267944336, + "rewards/thermo_reward/std": 1.9278887510299683, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10373136959969997, + "epoch": 1.088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08191133290529251, + "learning_rate": 1.8067498933362355e-06, + "loss": -0.0008, + "num_tokens": 4724503.0, + "reward": 12.963687896728516, + "reward_std": 2.7823119163513184, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.405520439147949, + "rewards/kidney_reward/std": 0.8700054883956909, + "rewards/length2tails_reward/mean": 0.6506054997444153, + "rewards/length2tails_reward/std": 0.36144185066223145, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0319223403930664, + "rewards/thermo_reward/std": 2.0205771923065186, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.03125, + "completions/mean_terminated_length": 271.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10066128429025412, + "epoch": 1.09, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07312504202127457, + "learning_rate": 1.8059915383360806e-06, + "loss": -0.0055, + "num_tokens": 4733208.0, + "reward": 12.294307708740234, + "reward_std": 3.056042432785034, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.350677251815796, + "rewards/kidney_reward/std": 0.8858228921890259, + "rewards/length2tails_reward/mean": 0.6339554786682129, + "rewards/length2tails_reward/std": 0.3406355082988739, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4765591621398926, + "rewards/thermo_reward/std": 2.209420680999756, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12119703646749258, + "epoch": 1.092, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08009244501590729, + "learning_rate": 1.805231858085356e-06, + "loss": -0.002, + "num_tokens": 4741972.0, + "reward": 12.57183837890625, + "reward_std": 1.7864048480987549, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.517043113708496, + "rewards/kidney_reward/std": 0.2941751182079315, + "rewards/length2tails_reward/mean": 0.7900687456130981, + "rewards/length2tails_reward/std": 0.25023266673088074, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.514604091644287, + "rewards/thermo_reward/std": 1.6984219551086426, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10524578206241131, + "epoch": 1.094, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0970984622836113, + "learning_rate": 1.8044708538331654e-06, + "loss": -0.0047, + "num_tokens": 4750693.0, + "reward": 12.265960693359375, + "reward_std": 3.2419352531433105, + "rewards/fitness_reward/mean": 6.930270195007324, + "rewards/fitness_reward/std": 2.1268088817596436, + "rewards/kidney_reward/mean": 2.4896838665008545, + "rewards/kidney_reward/std": 0.3229711949825287, + "rewards/length2tails_reward/mean": 0.6985119581222534, + "rewards/length2tails_reward/std": 0.33531445264816284, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6761550903320312, + "rewards/thermo_reward/std": 1.9302542209625244, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10378769133239985, + "epoch": 1.096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06036684662103653, + "learning_rate": 1.8037085268307885e-06, + "loss": -0.0035, + "num_tokens": 4759389.0, + "reward": 13.266702651977539, + "reward_std": 1.3902013301849365, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.6663521528244019, + "rewards/length2tails_reward/std": 0.30723318457603455, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1671204566955566, + "rewards/thermo_reward/std": 1.2469242811203003, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10272781364619732, + "epoch": 1.098, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2102782130241394, + "learning_rate": 1.8029448783316813e-06, + "loss": -0.0062, + "num_tokens": 4768066.0, + "reward": 11.990425109863281, + "reward_std": 5.225229740142822, + "rewards/fitness_reward/mean": 6.418025493621826, + "rewards/fitness_reward/std": 2.9806253910064697, + "rewards/kidney_reward/mean": 2.3426103591918945, + "rewards/kidney_reward/std": 0.9024714231491089, + "rewards/length2tails_reward/mean": 0.600758969783783, + "rewards/length2tails_reward/std": 0.387471467256546, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.069714069366455, + "rewards/thermo_reward/std": 1.4123339653015137, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09544859733432531, + "epoch": 1.1, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07070183753967285, + "learning_rate": 1.8021799095914708e-06, + "loss": -0.0042, + "num_tokens": 4776792.0, + "reward": 12.756484985351562, + "reward_std": 2.2132277488708496, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.503726005554199, + "rewards/kidney_reward/std": 0.5565682649612427, + "rewards/length2tails_reward/mean": 0.643738865852356, + "rewards/length2tails_reward/std": 0.368051677942276, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7272000312805176, + "rewards/thermo_reward/std": 1.876189947128296, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.1080935113132, + "epoch": 1.102, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10448921471834183, + "learning_rate": 1.8014136218679566e-06, + "loss": 0.0011, + "num_tokens": 4785510.0, + "reward": 12.542806625366211, + "reward_std": 2.689107894897461, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4469244480133057, + "rewards/kidney_reward/std": 0.7286657691001892, + "rewards/length2tails_reward/mean": 0.6926784515380859, + "rewards/length2tails_reward/std": 0.2712414860725403, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.622939109802246, + "rewards/thermo_reward/std": 1.9818955659866333, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10477001033723354, + "epoch": 1.104, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1856602132320404, + "learning_rate": 1.800646016421106e-06, + "loss": -0.0055, + "num_tokens": 4794268.0, + "reward": 12.144149780273438, + "reward_std": 4.565323352813721, + "rewards/fitness_reward/mean": 6.996285438537598, + "rewards/fitness_reward/std": 2.064185857772827, + "rewards/kidney_reward/mean": 2.3954811096191406, + "rewards/kidney_reward/std": 1.0103658437728882, + "rewards/length2tails_reward/mean": 0.7687793970108032, + "rewards/length2tails_reward/std": 0.2617262005805969, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.5755057334899902, + "rewards/thermo_reward/std": 2.078153610229492, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09166521392762661, + "epoch": 1.106, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08722712099552155, + "learning_rate": 1.7998770945130538e-06, + "loss": -0.0095, + "num_tokens": 4803014.0, + "reward": 11.275504112243652, + "reward_std": 4.791029930114746, + "rewards/fitness_reward/mean": 6.587876796722412, + "rewards/fitness_reward/std": 2.605294704437256, + "rewards/kidney_reward/mean": 2.1699628829956055, + "rewards/kidney_reward/std": 1.2778137922286987, + "rewards/length2tails_reward/mean": 0.6810904741287231, + "rewards/length2tails_reward/std": 0.37631070613861084, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.349555730819702, + "rewards/thermo_reward/std": 2.1994354724884033, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1041144272312522, + "epoch": 1.108, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07147329300642014, + "learning_rate": 1.7991068574080986e-06, + "loss": -0.0033, + "num_tokens": 4811741.0, + "reward": 11.137859344482422, + "reward_std": 5.557100772857666, + "rewards/fitness_reward/mean": 6.831142425537109, + "rewards/fitness_reward/std": 2.1162006855010986, + "rewards/kidney_reward/mean": 2.0171127319335938, + "rewards/kidney_reward/std": 1.558064341545105, + "rewards/length2tails_reward/mean": 0.6767317056655884, + "rewards/length2tails_reward/std": 0.36727866530418396, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.1219303607940674, + "rewards/thermo_reward/std": 2.5796494483947754, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10824454110115767, + "epoch": 1.11, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0778418481349945, + "learning_rate": 1.7983353063727014e-06, + "loss": -0.0079, + "num_tokens": 4820444.0, + "reward": 13.1099853515625, + "reward_std": 1.6137299537658691, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.536646842956543, + "rewards/kidney_reward/std": 0.5081712007522583, + "rewards/length2tails_reward/mean": 0.694353461265564, + "rewards/length2tails_reward/std": 0.31348589062690735, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.042717933654785, + "rewards/thermo_reward/std": 1.2481979131698608, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09657656960189342, + "epoch": 1.112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10516510903835297, + "learning_rate": 1.7975624426754845e-06, + "loss": -0.0027, + "num_tokens": 4829134.0, + "reward": 11.269773483276367, + "reward_std": 6.188965797424316, + "rewards/fitness_reward/mean": 6.2905683517456055, + "rewards/fitness_reward/std": 3.396219253540039, + "rewards/kidney_reward/mean": 2.1874663829803467, + "rewards/kidney_reward/std": 1.343290090560913, + "rewards/length2tails_reward/mean": 0.6322581171989441, + "rewards/length2tails_reward/std": 0.3359857499599457, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.628514051437378, + "rewards/thermo_reward/std": 1.8966007232666016, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.40625, + "completions/mean_terminated_length": 271.40625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11595190409570932, + "epoch": 1.114, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1653381735086441, + "learning_rate": 1.7967882675872276e-06, + "loss": 0.0036, + "num_tokens": 4837851.0, + "reward": 11.56589126586914, + "reward_std": 5.274896144866943, + "rewards/fitness_reward/mean": 6.966367721557617, + "rewards/fitness_reward/std": 2.2334251403808594, + "rewards/kidney_reward/mean": 2.231813430786133, + "rewards/kidney_reward/std": 1.3505194187164307, + "rewards/length2tails_reward/mean": 0.6987495422363281, + "rewards/length2tails_reward/std": 0.3107140362262726, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.1978354454040527, + "rewards/thermo_reward/std": 2.329174518585205, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.40625, + "completions/mean_terminated_length": 272.40625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11444343067705631, + "epoch": 1.116, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08911436796188354, + "learning_rate": 1.7960127823808678e-06, + "loss": 0.0022, + "num_tokens": 4846600.0, + "reward": 11.401962280273438, + "reward_std": 5.1144914627075195, + "rewards/fitness_reward/mean": 6.89082145690918, + "rewards/fitness_reward/std": 2.0927488803863525, + "rewards/kidney_reward/mean": 2.1250972747802734, + "rewards/kidney_reward/std": 1.3569532632827759, + "rewards/length2tails_reward/mean": 0.7734675407409668, + "rewards/length2tails_reward/std": 0.2523795962333679, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.2086966037750244, + "rewards/thermo_reward/std": 2.2721035480499268, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.65625, + "completions/mean_terminated_length": 271.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10886608902364969, + "epoch": 1.1179999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08465147018432617, + "learning_rate": 1.7952359883314953e-06, + "loss": -0.006, + "num_tokens": 4855325.0, + "reward": 12.46010684967041, + "reward_std": 2.8142449855804443, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.334458827972412, + "rewards/kidney_reward/std": 0.851756751537323, + "rewards/length2tails_reward/mean": 0.7321093082427979, + "rewards/length2tails_reward/std": 0.2910517752170563, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.591252088546753, + "rewards/thermo_reward/std": 2.0499556064605713, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09685478266328573, + "epoch": 1.12, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09345784038305283, + "learning_rate": 1.7944578867163528e-06, + "loss": -0.0012, + "num_tokens": 4864080.0, + "reward": 11.20602035522461, + "reward_std": 6.437448978424072, + "rewards/fitness_reward/mean": 6.544711589813232, + "rewards/fitness_reward/std": 3.0064234733581543, + "rewards/kidney_reward/mean": 2.0772125720977783, + "rewards/kidney_reward/std": 1.6922893524169922, + "rewards/length2tails_reward/mean": 0.7151135802268982, + "rewards/length2tails_reward/std": 0.3349017798900604, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4125847816467285, + "rewards/thermo_reward/std": 2.328195571899414, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0986628895625472, + "epoch": 1.1219999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09137522429227829, + "learning_rate": 1.7936784788148325e-06, + "loss": -0.0059, + "num_tokens": 4872800.0, + "reward": 12.53663158416748, + "reward_std": 3.1190147399902344, + "rewards/fitness_reward/mean": 7.188657283782959, + "rewards/fitness_reward/std": 0.5449937582015991, + "rewards/kidney_reward/mean": 2.36080002784729, + "rewards/kidney_reward/std": 0.7742516994476318, + "rewards/length2tails_reward/mean": 0.688651442527771, + "rewards/length2tails_reward/std": 0.31537818908691406, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8183093070983887, + "rewards/thermo_reward/std": 2.1796393394470215, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.11182715278118849, + "epoch": 1.124, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43471959233283997, + "learning_rate": 1.792897765908475e-06, + "loss": -0.0146, + "num_tokens": 4881464.0, + "reward": 9.44644546508789, + "reward_std": 8.440580368041992, + "rewards/fitness_reward/mean": 6.1534929275512695, + "rewards/fitness_reward/std": 3.818556308746338, + "rewards/kidney_reward/mean": 1.7774693965911865, + "rewards/kidney_reward/std": 2.152588367462158, + "rewards/length2tails_reward/mean": 0.6347315907478333, + "rewards/length2tails_reward/std": 0.38687554001808167, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.3520097732543945, + "rewards/thermo_reward/std": 3.0032525062561035, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09960361663252115, + "epoch": 1.126, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15998654067516327, + "learning_rate": 1.792115749280967e-06, + "loss": -0.009, + "num_tokens": 4890154.0, + "reward": 11.679988861083984, + "reward_std": 4.114315032958984, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.2523083686828613, + "rewards/kidney_reward/std": 0.9392135739326477, + "rewards/length2tails_reward/mean": 0.583635687828064, + "rewards/length2tails_reward/std": 0.39422547817230225, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.2162628173828125, + "rewards/thermo_reward/std": 2.487140655517578, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10299806576222181, + "epoch": 1.1280000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08876167237758636, + "learning_rate": 1.791332430218138e-06, + "loss": -0.0033, + "num_tokens": 4898918.0, + "reward": 11.167304992675781, + "reward_std": 5.945474624633789, + "rewards/fitness_reward/mean": 6.2546796798706055, + "rewards/fitness_reward/std": 3.3238136768341064, + "rewards/kidney_reward/mean": 2.1046230792999268, + "rewards/kidney_reward/std": 1.303746223449707, + "rewards/length2tails_reward/mean": 0.7680449485778809, + "rewards/length2tails_reward/std": 0.2990765869617462, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.631197929382324, + "rewards/thermo_reward/std": 1.9832183122634888, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.6875, + "completions/mean_terminated_length": 269.6875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09318594913929701, + "epoch": 1.13, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08641897886991501, + "learning_rate": 1.7905478100079596e-06, + "loss": -0.0062, + "num_tokens": 4907580.0, + "reward": 12.202686309814453, + "reward_std": 4.635074138641357, + "rewards/fitness_reward/mean": 6.691686153411865, + "rewards/fitness_reward/std": 2.6358587741851807, + "rewards/kidney_reward/mean": 2.3179144859313965, + "rewards/kidney_reward/std": 0.9202808141708374, + "rewards/length2tails_reward/mean": 0.5614801645278931, + "rewards/length2tails_reward/std": 0.38135483860969543, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.036937713623047, + "rewards/thermo_reward/std": 1.7109110355377197, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10470135603100061, + "epoch": 1.1320000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05364145338535309, + "learning_rate": 1.7897618899405421e-06, + "loss": -0.0047, + "num_tokens": 4916284.0, + "reward": 13.045207023620605, + "reward_std": 2.1981866359710693, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.508059501647949, + "rewards/kidney_reward/std": 0.5330638289451599, + "rewards/length2tails_reward/mean": 0.6730586290359497, + "rewards/length2tails_reward/std": 0.27641263604164124, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0086560249328613, + "rewards/thermo_reward/std": 1.7391635179519653, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0999163817614317, + "epoch": 1.134, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08887635916471481, + "learning_rate": 1.7889746713081341e-06, + "loss": -0.0018, + "num_tokens": 4924999.0, + "reward": 11.056597709655762, + "reward_std": 6.2495503425598145, + "rewards/fitness_reward/mean": 6.679024696350098, + "rewards/fitness_reward/std": 2.6855037212371826, + "rewards/kidney_reward/mean": 1.9901930093765259, + "rewards/kidney_reward/std": 1.545816421508789, + "rewards/length2tails_reward/mean": 0.658514678478241, + "rewards/length2tails_reward/std": 0.3207051753997803, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.2215282917022705, + "rewards/thermo_reward/std": 2.6121809482574463, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10158183518797159, + "epoch": 1.1360000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06434900313615799, + "learning_rate": 1.7881861554051184e-06, + "loss": 0.0011, + "num_tokens": 4933723.0, + "reward": 12.243963241577148, + "reward_std": 4.706650733947754, + "rewards/fitness_reward/mean": 6.951861381530762, + "rewards/fitness_reward/std": 2.006192445755005, + "rewards/kidney_reward/mean": 2.3538095951080322, + "rewards/kidney_reward/std": 1.1005569696426392, + "rewards/length2tails_reward/mean": 0.7174511551856995, + "rewards/length2tails_reward/std": 0.26794394850730896, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.766547918319702, + "rewards/thermo_reward/std": 1.9785237312316895, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10938147269189358, + "epoch": 1.138, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08531554788351059, + "learning_rate": 1.787396343528012e-06, + "loss": -0.0029, + "num_tokens": 4942426.0, + "reward": 12.526836395263672, + "reward_std": 3.3885457515716553, + "rewards/fitness_reward/mean": 6.98751163482666, + "rewards/fitness_reward/std": 2.113816022872925, + "rewards/kidney_reward/mean": 2.41365909576416, + "rewards/kidney_reward/std": 0.7694591879844666, + "rewards/length2tails_reward/mean": 0.6483283042907715, + "rewards/length2tails_reward/std": 0.34616991877555847, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9608325958251953, + "rewards/thermo_reward/std": 1.7782973051071167, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1083859596401453, + "epoch": 1.1400000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09535173326730728, + "learning_rate": 1.7866052369754613e-06, + "loss": 0.0008, + "num_tokens": 4951136.0, + "reward": 11.748469352722168, + "reward_std": 4.731593608856201, + "rewards/fitness_reward/mean": 6.976041316986084, + "rewards/fitness_reward/std": 1.871350884437561, + "rewards/kidney_reward/mean": 2.1608309745788574, + "rewards/kidney_reward/std": 1.228219747543335, + "rewards/length2tails_reward/mean": 0.6496375799179077, + "rewards/length2tails_reward/std": 0.3468906879425049, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4466333389282227, + "rewards/thermo_reward/std": 2.20149827003479, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10877596493810415, + "epoch": 1.142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08137992769479752, + "learning_rate": 1.7858128370482423e-06, + "loss": -0.0058, + "num_tokens": 4959856.0, + "reward": 12.81321907043457, + "reward_std": 1.8809332847595215, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.6763936281204224, + "rewards/length2tails_reward/std": 0.36004820466041565, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.712632894515991, + "rewards/thermo_reward/std": 1.7976608276367188, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11456997133791447, + "epoch": 1.144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07577666640281677, + "learning_rate": 1.785019145049258e-06, + "loss": -0.0032, + "num_tokens": 4968601.0, + "reward": 10.718204498291016, + "reward_std": 6.012451171875, + "rewards/fitness_reward/mean": 6.324345588684082, + "rewards/fitness_reward/std": 3.2866885662078857, + "rewards/kidney_reward/mean": 2.104271411895752, + "rewards/kidney_reward/std": 1.3013216257095337, + "rewards/length2tails_reward/mean": 0.6984899044036865, + "rewards/length2tails_reward/std": 0.34832921624183655, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.119739532470703, + "rewards/thermo_reward/std": 2.407792091369629, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11358758341521025, + "epoch": 1.146, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06758356094360352, + "learning_rate": 1.7842241622835354e-06, + "loss": 0.002, + "num_tokens": 4977304.0, + "reward": 12.825021743774414, + "reward_std": 1.961801290512085, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4824347496032715, + "rewards/kidney_reward/std": 0.5400055050849915, + "rewards/length2tails_reward/mean": 0.6755943298339844, + "rewards/length2tails_reward/std": 0.3082455098628998, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.813843250274658, + "rewards/thermo_reward/std": 1.6336705684661865, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10494345147162676, + "epoch": 1.148, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09451685100793839, + "learning_rate": 1.7834278900582237e-06, + "loss": 0.0028, + "num_tokens": 4985984.0, + "reward": 12.46784496307373, + "reward_std": 4.835677623748779, + "rewards/fitness_reward/mean": 7.028944969177246, + "rewards/fitness_reward/std": 1.8794351816177368, + "rewards/kidney_reward/mean": 2.3567276000976562, + "rewards/kidney_reward/std": 1.2254520654678345, + "rewards/length2tails_reward/mean": 0.6347229480743408, + "rewards/length2tails_reward/std": 0.34789660573005676, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9186999797821045, + "rewards/thermo_reward/std": 1.9678254127502441, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.40625, + "completions/mean_terminated_length": 272.40625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11180542130023241, + "epoch": 1.15, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08369499444961548, + "learning_rate": 1.7826303296825924e-06, + "loss": 0.0009, + "num_tokens": 4994733.0, + "reward": 12.160469055175781, + "reward_std": 4.53151798248291, + "rewards/fitness_reward/mean": 6.954615116119385, + "rewards/fitness_reward/std": 1.9908239841461182, + "rewards/kidney_reward/mean": 2.3553924560546875, + "rewards/kidney_reward/std": 0.9413848519325256, + "rewards/length2tails_reward/mean": 0.7666806578636169, + "rewards/length2tails_reward/std": 0.2807237207889557, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6737937927246094, + "rewards/thermo_reward/std": 2.1368064880371094, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09493160434067249, + "epoch": 1.152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07596372812986374, + "learning_rate": 1.7818314824680298e-06, + "loss": -0.0025, + "num_tokens": 5003427.0, + "reward": 12.270533561706543, + "reward_std": 1.898982048034668, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.517043113708496, + "rewards/kidney_reward/std": 0.2941751182079315, + "rewards/length2tails_reward/mean": 0.5976208448410034, + "rewards/length2tails_reward/std": 0.35658320784568787, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.232543706893921, + "rewards/thermo_reward/std": 1.8339343070983887, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.10535383597016335, + "epoch": 1.154, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16355304419994354, + "learning_rate": 1.7810313497280387e-06, + "loss": 0.0026, + "num_tokens": 5012103.0, + "reward": 12.154855728149414, + "reward_std": 5.096898078918457, + "rewards/fitness_reward/mean": 7.000971794128418, + "rewards/fitness_reward/std": 2.0376741886138916, + "rewards/kidney_reward/mean": 2.354713201522827, + "rewards/kidney_reward/std": 1.2366715669631958, + "rewards/length2tails_reward/mean": 0.6913293600082397, + "rewards/length2tails_reward/std": 0.3156411349773407, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.630038261413574, + "rewards/thermo_reward/std": 2.201185941696167, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09658471308648586, + "epoch": 1.156, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06265582889318466, + "learning_rate": 1.7802299327782366e-06, + "loss": -0.0059, + "num_tokens": 5020799.0, + "reward": 11.959808349609375, + "reward_std": 3.974642276763916, + "rewards/fitness_reward/mean": 6.929236888885498, + "rewards/fitness_reward/std": 2.1325860023498535, + "rewards/kidney_reward/mean": 2.4361748695373535, + "rewards/kidney_reward/std": 0.6509256958961487, + "rewards/length2tails_reward/mean": 0.630581259727478, + "rewards/length2tails_reward/std": 0.4034130275249481, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.43133807182312, + "rewards/thermo_reward/std": 2.2508015632629395, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10977741423994303, + "epoch": 1.158, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21476377546787262, + "learning_rate": 1.7794272329363525e-06, + "loss": 0.0012, + "num_tokens": 5029498.0, + "reward": 12.051267623901367, + "reward_std": 4.693270683288574, + "rewards/fitness_reward/mean": 6.910964488983154, + "rewards/fitness_reward/std": 1.9847608804702759, + "rewards/kidney_reward/mean": 2.3280601501464844, + "rewards/kidney_reward/std": 1.00356924533844, + "rewards/length2tails_reward/mean": 0.6109268665313721, + "rewards/length2tails_reward/std": 0.32766100764274597, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.651151180267334, + "rewards/thermo_reward/std": 2.1728739738464355, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10096995253115892, + "epoch": 1.16, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11981163173913956, + "learning_rate": 1.7786232515222241e-06, + "loss": -0.0082, + "num_tokens": 5038210.0, + "reward": 11.612472534179688, + "reward_std": 5.185047149658203, + "rewards/fitness_reward/mean": 6.672616004943848, + "rewards/fitness_reward/std": 2.490133762359619, + "rewards/kidney_reward/mean": 2.2824039459228516, + "rewards/kidney_reward/std": 1.100603461265564, + "rewards/length2tails_reward/mean": 0.7170855402946472, + "rewards/length2tails_reward/std": 0.3222355544567108, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4857451915740967, + "rewards/thermo_reward/std": 2.2294204235076904, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11031336337327957, + "epoch": 1.162, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06516198813915253, + "learning_rate": 1.7778179898577971e-06, + "loss": -0.0036, + "num_tokens": 5046952.0, + "reward": 12.819649696350098, + "reward_std": 2.8847804069519043, + "rewards/fitness_reward/mean": 7.051357746124268, + "rewards/fitness_reward/std": 1.7526482343673706, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7042213678359985, + "rewards/length2tails_reward/std": 0.3321356475353241, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0261082649230957, + "rewards/thermo_reward/std": 1.4118154048919678, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11428037006407976, + "epoch": 1.164, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09282801300287247, + "learning_rate": 1.7770114492671224e-06, + "loss": -0.0008, + "num_tokens": 5055678.0, + "reward": 12.54432487487793, + "reward_std": 3.2210092544555664, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.324988842010498, + "rewards/kidney_reward/std": 0.9561151266098022, + "rewards/length2tails_reward/mean": 0.7320905327796936, + "rewards/length2tails_reward/std": 0.30599185824394226, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.742450714111328, + "rewards/thermo_reward/std": 2.1522037982940674, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.21875, + "completions/mean_terminated_length": 270.21875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09224184788763523, + "epoch": 1.166, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0960201770067215, + "learning_rate": 1.776203631076353e-06, + "loss": -0.0066, + "num_tokens": 5064357.0, + "reward": 11.877236366271973, + "reward_std": 5.058732509613037, + "rewards/fitness_reward/mean": 6.731173515319824, + "rewards/fitness_reward/std": 2.4796977043151855, + "rewards/kidney_reward/mean": 2.3305821418762207, + "rewards/kidney_reward/std": 0.9962871670722961, + "rewards/length2tails_reward/mean": 0.5928375124931335, + "rewards/length2tails_reward/std": 0.36502107977867126, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6561970710754395, + "rewards/thermo_reward/std": 2.1421453952789307, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1109061436727643, + "epoch": 1.168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09443071484565735, + "learning_rate": 1.7753945366137426e-06, + "loss": 0.0031, + "num_tokens": 5073079.0, + "reward": 11.314729690551758, + "reward_std": 5.843605041503906, + "rewards/fitness_reward/mean": 6.943359851837158, + "rewards/fitness_reward/std": 2.3635756969451904, + "rewards/kidney_reward/mean": 2.131972074508667, + "rewards/kidney_reward/std": 1.5495785474777222, + "rewards/length2tails_reward/mean": 0.687659502029419, + "rewards/length2tails_reward/std": 0.3384109139442444, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.070632219314575, + "rewards/thermo_reward/std": 2.596601724624634, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11310446728020906, + "epoch": 1.17, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11845079064369202, + "learning_rate": 1.7745841672096442e-06, + "loss": -0.0, + "num_tokens": 5081829.0, + "reward": 11.575647354125977, + "reward_std": 5.9788408279418945, + "rewards/fitness_reward/mean": 6.657668590545654, + "rewards/fitness_reward/std": 2.790924310684204, + "rewards/kidney_reward/mean": 2.1905972957611084, + "rewards/kidney_reward/std": 1.4292532205581665, + "rewards/length2tails_reward/mean": 0.7536600828170776, + "rewards/length2tails_reward/std": 0.29540786147117615, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.552015781402588, + "rewards/thermo_reward/std": 2.207545042037964, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.09375, + "completions/mean_terminated_length": 271.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10994715616106987, + "epoch": 1.172, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11424069106578827, + "learning_rate": 1.7737725241965067e-06, + "loss": -0.0041, + "num_tokens": 5090536.0, + "reward": 11.595703125, + "reward_std": 5.179896354675293, + "rewards/fitness_reward/mean": 6.970044136047363, + "rewards/fitness_reward/std": 1.9047677516937256, + "rewards/kidney_reward/mean": 2.108436346054077, + "rewards/kidney_reward/std": 1.3753132820129395, + "rewards/length2tails_reward/mean": 0.6399242877960205, + "rewards/length2tails_reward/std": 0.3654977083206177, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.353229522705078, + "rewards/thermo_reward/std": 2.6165781021118164, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10835985839366913, + "epoch": 1.174, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10886748135089874, + "learning_rate": 1.7729596089088727e-06, + "loss": -0.0056, + "num_tokens": 5099231.0, + "reward": 10.739278793334961, + "reward_std": 5.102948188781738, + "rewards/fitness_reward/mean": 6.629342079162598, + "rewards/fitness_reward/std": 2.4391729831695557, + "rewards/kidney_reward/mean": 1.9109983444213867, + "rewards/kidney_reward/std": 1.4604812860488892, + "rewards/length2tails_reward/mean": 0.6606801748275757, + "rewards/length2tails_reward/std": 0.337507039308548, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.0328705310821533, + "rewards/thermo_reward/std": 2.385857582092285, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.09375, + "completions/mean_terminated_length": 270.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10174105782061815, + "epoch": 1.176, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08054821938276291, + "learning_rate": 1.7721454226833775e-06, + "loss": -0.0052, + "num_tokens": 5107906.0, + "reward": 12.438180923461914, + "reward_std": 3.1698365211486816, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.2543578147888184, + "rewards/kidney_reward/std": 1.0256778001785278, + "rewards/length2tails_reward/mean": 0.6150473356246948, + "rewards/length2tails_reward/std": 0.34308475255966187, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6611340045928955, + "rewards/thermo_reward/std": 2.239284038543701, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1017656335607171, + "epoch": 1.178, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1287049800157547, + "learning_rate": 1.7713299668587457e-06, + "loss": -0.004, + "num_tokens": 5116619.0, + "reward": 13.496252059936523, + "reward_std": 1.0705134868621826, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.6924083232879639, + "rewards/length2tails_reward/std": 0.3212348222732544, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.394064426422119, + "rewards/thermo_reward/std": 0.9870204925537109, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.09375, + "completions/mean_terminated_length": 269.09375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.10302947741001844, + "epoch": 1.18, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2910614609718323, + "learning_rate": 1.7705132427757892e-06, + "loss": -0.002, + "num_tokens": 5125262.0, + "reward": 12.19694995880127, + "reward_std": 4.453863143920898, + "rewards/fitness_reward/mean": 7.043661117553711, + "rewards/fitness_reward/std": 1.7961864471435547, + "rewards/kidney_reward/mean": 2.3480801582336426, + "rewards/kidney_reward/std": 1.1257485151290894, + "rewards/length2tails_reward/mean": 0.6744387149810791, + "rewards/length2tails_reward/std": 0.29502642154693604, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.637765407562256, + "rewards/thermo_reward/std": 2.0495615005493164, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.08723223209381104, + "epoch": 1.182, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.062244486063718796, + "learning_rate": 1.769695251777406e-06, + "loss": -0.0048, + "num_tokens": 5133946.0, + "reward": 11.108282089233398, + "reward_std": 6.026435375213623, + "rewards/fitness_reward/mean": 6.404687881469727, + "rewards/fitness_reward/std": 3.023834705352783, + "rewards/kidney_reward/mean": 2.187194585800171, + "rewards/kidney_reward/std": 1.2497555017471313, + "rewards/length2tails_reward/mean": 0.6100426912307739, + "rewards/length2tails_reward/std": 0.38546499609947205, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.355395793914795, + "rewards/thermo_reward/std": 2.284048080444336, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.59375, + "completions/mean_terminated_length": 270.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11061716824769974, + "epoch": 1.184, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1302819401025772, + "learning_rate": 1.7688759952085763e-06, + "loss": -0.0039, + "num_tokens": 5142637.0, + "reward": 10.234525680541992, + "reward_std": 5.914739608764648, + "rewards/fitness_reward/mean": 6.1420488357543945, + "rewards/fitness_reward/std": 3.34014630317688, + "rewards/kidney_reward/mean": 1.9530357122421265, + "rewards/kidney_reward/std": 1.4134089946746826, + "rewards/length2tails_reward/mean": 0.6440788507461548, + "rewards/length2tails_reward/std": 0.3247174620628357, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 1.9750324487686157, + "rewards/thermo_reward/std": 2.403918981552124, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10884816199541092, + "epoch": 1.186, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.078577421605587, + "learning_rate": 1.7680554744163623e-06, + "loss": -0.0008, + "num_tokens": 5151336.0, + "reward": 12.893369674682617, + "reward_std": 1.8842889070510864, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4896838665008545, + "rewards/kidney_reward/std": 0.3229711949825287, + "rewards/length2tails_reward/mean": 0.6638084650039673, + "rewards/length2tails_reward/std": 0.34161463379859924, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.876119375228882, + "rewards/thermo_reward/std": 1.6256120204925537, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10069420002400875, + "epoch": 1.188, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07682990282773972, + "learning_rate": 1.7672336907499037e-06, + "loss": -0.0025, + "num_tokens": 5160082.0, + "reward": 12.909473419189453, + "reward_std": 2.9470677375793457, + "rewards/fitness_reward/mean": 7.052633285522461, + "rewards/fitness_reward/std": 1.7454336881637573, + "rewards/kidney_reward/mean": 2.4896838665008545, + "rewards/kidney_reward/std": 0.3229711949825287, + "rewards/length2tails_reward/mean": 0.7239134311676025, + "rewards/length2tails_reward/std": 0.35869836807250977, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.194766044616699, + "rewards/thermo_reward/std": 1.3918113708496094, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11134236957877874, + "epoch": 1.19, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10810114443302155, + "learning_rate": 1.7664106455604174e-06, + "loss": 0.0043, + "num_tokens": 5168775.0, + "reward": 13.395252227783203, + "reward_std": 1.441155195236206, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.6862658262252808, + "rewards/length2tails_reward/std": 0.27184396982192993, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.26632022857666, + "rewards/thermo_reward/std": 1.3266010284423828, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10387864802032709, + "epoch": 1.192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06299920380115509, + "learning_rate": 1.7655863402011946e-06, + "loss": -0.0062, + "num_tokens": 5177503.0, + "reward": 12.111298561096191, + "reward_std": 3.954411268234253, + "rewards/fitness_reward/mean": 6.99554443359375, + "rewards/fitness_reward/std": 1.7628074884414673, + "rewards/kidney_reward/mean": 2.238969326019287, + "rewards/kidney_reward/std": 0.9901480674743652, + "rewards/length2tails_reward/mean": 0.6856290698051453, + "rewards/length2tails_reward/std": 0.3055051565170288, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.708221673965454, + "rewards/thermo_reward/std": 1.982776403427124, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11133202258497477, + "epoch": 1.194, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08474539965391159, + "learning_rate": 1.7647607760275985e-06, + "loss": -0.0039, + "num_tokens": 5186190.0, + "reward": 13.106220245361328, + "reward_std": 1.714718222618103, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.5953869819641113, + "rewards/length2tails_reward/std": 0.35357436537742615, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.013735294342041, + "rewards/thermo_reward/std": 1.5857094526290894, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10823120549321175, + "epoch": 1.196, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0782100260257721, + "learning_rate": 1.7639339543970612e-06, + "loss": -0.0021, + "num_tokens": 5194888.0, + "reward": 11.78781509399414, + "reward_std": 4.305365085601807, + "rewards/fitness_reward/mean": 7.188657283782959, + "rewards/fitness_reward/std": 0.5449937582015991, + "rewards/kidney_reward/mean": 2.051370143890381, + "rewards/kidney_reward/std": 1.3266252279281616, + "rewards/length2tails_reward/mean": 0.6458899974822998, + "rewards/length2tails_reward/std": 0.35246819257736206, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.383197784423828, + "rewards/thermo_reward/std": 2.690925121307373, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11543369013816118, + "epoch": 1.198, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06506896018981934, + "learning_rate": 1.7631058766690839e-06, + "loss": -0.0026, + "num_tokens": 5203651.0, + "reward": 11.880399703979492, + "reward_std": 5.43539571762085, + "rewards/fitness_reward/mean": 6.639829635620117, + "rewards/fitness_reward/std": 2.624504566192627, + "rewards/kidney_reward/mean": 2.231795310974121, + "rewards/kidney_reward/std": 1.2990399599075317, + "rewards/length2tails_reward/mean": 0.8059386610984802, + "rewards/length2tails_reward/std": 0.2669394314289093, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.828181266784668, + "rewards/thermo_reward/std": 1.7878499031066895, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10662618558853865, + "epoch": 1.2, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19384585320949554, + "learning_rate": 1.762276544205232e-06, + "loss": -0.0036, + "num_tokens": 5212379.0, + "reward": 12.483643531799316, + "reward_std": 2.7700603008270264, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.487910747528076, + "rewards/kidney_reward/std": 0.6429896354675293, + "rewards/length2tails_reward/mean": 0.69349205493927, + "rewards/length2tails_reward/std": 0.31337374448776245, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.522707939147949, + "rewards/thermo_reward/std": 2.1695806980133057, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1081048995256424, + "epoch": 1.202, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07642143219709396, + "learning_rate": 1.7614459583691342e-06, + "loss": 0.0001, + "num_tokens": 5221113.0, + "reward": 13.319950103759766, + "reward_std": 1.2846815586090088, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7720909118652344, + "rewards/length2tails_reward/std": 0.27542778849601746, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1824355125427246, + "rewards/thermo_reward/std": 1.187772512435913, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10744845122098923, + "epoch": 1.204, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08143097162246704, + "learning_rate": 1.7606141205264808e-06, + "loss": -0.0027, + "num_tokens": 5229848.0, + "reward": 12.947037696838379, + "reward_std": 2.0212647914886475, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.517043113708496, + "rewards/kidney_reward/std": 0.2941751182079315, + "rewards/length2tails_reward/mean": 0.7308014035224915, + "rewards/length2tails_reward/std": 0.2996106445789337, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8957290649414062, + "rewards/thermo_reward/std": 1.8434280157089233, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.0625, + "completions/mean_terminated_length": 271.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10248364508152008, + "epoch": 1.206, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13778772950172424, + "learning_rate": 1.7597810320450197e-06, + "loss": 0.0046, + "num_tokens": 5238554.0, + "reward": 11.7276611328125, + "reward_std": 6.353403091430664, + "rewards/fitness_reward/mean": 6.516511917114258, + "rewards/fitness_reward/std": 2.9167258739471436, + "rewards/kidney_reward/mean": 2.183387517929077, + "rewards/kidney_reward/std": 1.5369446277618408, + "rewards/length2tails_reward/mean": 0.673989474773407, + "rewards/length2tails_reward/std": 0.30423054099082947, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8603644371032715, + "rewards/thermo_reward/std": 2.0831940174102783, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.10588277224451303, + "epoch": 1.208, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05363436043262482, + "learning_rate": 1.7589466942945555e-06, + "loss": -0.0022, + "num_tokens": 5247256.0, + "reward": 13.326116561889648, + "reward_std": 1.268410086631775, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7883108854293823, + "rewards/length2tails_reward/std": 0.2947376072406769, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.214339256286621, + "rewards/thermo_reward/std": 1.1003755331039429, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0990461278706789, + "epoch": 1.21, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15633194148540497, + "learning_rate": 1.7581111086469473e-06, + "loss": -0.0049, + "num_tokens": 5255999.0, + "reward": 12.139235496520996, + "reward_std": 4.885914325714111, + "rewards/fitness_reward/mean": 6.9905595779418945, + "rewards/fitness_reward/std": 2.0965752601623535, + "rewards/kidney_reward/mean": 2.3520703315734863, + "rewards/kidney_reward/std": 1.1037944555282593, + "rewards/length2tails_reward/mean": 0.6999367475509644, + "rewards/length2tails_reward/std": 0.3327362835407257, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6266119480133057, + "rewards/thermo_reward/std": 2.315368413925171, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.102089773863554, + "epoch": 1.212, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05936253443360329, + "learning_rate": 1.7572742764761053e-06, + "loss": -0.0042, + "num_tokens": 5264762.0, + "reward": 12.085190773010254, + "reward_std": 4.8999714851379395, + "rewards/fitness_reward/mean": 6.707517147064209, + "rewards/fitness_reward/std": 2.5766422748565674, + "rewards/kidney_reward/mean": 2.2042856216430664, + "rewards/kidney_reward/std": 1.189802646636963, + "rewards/length2tails_reward/mean": 0.7570756673812866, + "rewards/length2tails_reward/std": 0.33921998739242554, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.997680425643921, + "rewards/thermo_reward/std": 1.6423041820526123, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0996887730434537, + "epoch": 1.214, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0814649760723114, + "learning_rate": 1.7564361991579904e-06, + "loss": -0.0057, + "num_tokens": 5273495.0, + "reward": 12.056499481201172, + "reward_std": 3.3672938346862793, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.4296531677246094, + "rewards/kidney_reward/std": 0.5578335523605347, + "rewards/length2tails_reward/mean": 0.6635578870773315, + "rewards/length2tails_reward/std": 0.35432568192481995, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4074368476867676, + "rewards/thermo_reward/std": 1.871981143951416, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09989192243665457, + "epoch": 1.216, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08182590454816818, + "learning_rate": 1.7555968780706094e-06, + "loss": -0.0097, + "num_tokens": 5282179.0, + "reward": 11.646211624145508, + "reward_std": 4.344832897186279, + "rewards/fitness_reward/mean": 6.744922161102295, + "rewards/fitness_reward/std": 2.4249672889709473, + "rewards/kidney_reward/mean": 2.2594614028930664, + "rewards/kidney_reward/std": 1.0196256637573242, + "rewards/length2tails_reward/mean": 0.5947597026824951, + "rewards/length2tails_reward/std": 0.3878384232521057, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 2.4886021614074707, + "rewards/thermo_reward/std": 1.9756776094436646, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09534382726997137, + "epoch": 1.218, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0723676010966301, + "learning_rate": 1.7547563145940156e-06, + "loss": -0.0072, + "num_tokens": 5290863.0, + "reward": 12.647649765014648, + "reward_std": 3.0074362754821777, + "rewards/fitness_reward/mean": 6.963343620300293, + "rewards/fitness_reward/std": 1.9421277046203613, + "rewards/kidney_reward/mean": 2.508197546005249, + "rewards/kidney_reward/std": 0.5323163270950317, + "rewards/length2tails_reward/mean": 0.6217700839042664, + "rewards/length2tails_reward/std": 0.345920205116272, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0139319896698, + "rewards/thermo_reward/std": 1.527239441871643, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.53125, + "completions/mean_terminated_length": 270.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10193273890763521, + "epoch": 1.22, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05713942274451256, + "learning_rate": 1.7539145101103042e-06, + "loss": -0.0058, + "num_tokens": 5299552.0, + "reward": 12.285564422607422, + "reward_std": 3.7375283241271973, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.4296531677246094, + "rewards/kidney_reward/std": 0.5578335523605347, + "rewards/length2tails_reward/mean": 0.6197089552879333, + "rewards/length2tails_reward/std": 0.34006068110466003, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6408867835998535, + "rewards/thermo_reward/std": 2.2673563957214355, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 273.28125, + "completions/mean_terminated_length": 273.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10183690022677183, + "epoch": 1.222, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.061640795320272446, + "learning_rate": 1.753071466003611e-06, + "loss": -0.0057, + "num_tokens": 5308329.0, + "reward": 12.557572364807129, + "reward_std": 3.8539183139801025, + "rewards/fitness_reward/mean": 7.030216693878174, + "rewards/fitness_reward/std": 1.8722407817840576, + "rewards/kidney_reward/mean": 2.438776969909668, + "rewards/kidney_reward/std": 0.6374463438987732, + "rewards/length2tails_reward/mean": 0.7697838544845581, + "rewards/length2tails_reward/std": 0.32206451892852783, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9116005897521973, + "rewards/thermo_reward/std": 1.5846034288406372, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10672392975538969, + "epoch": 1.224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06281083822250366, + "learning_rate": 1.75222718366011e-06, + "loss": -0.004, + "num_tokens": 5317025.0, + "reward": 11.681337356567383, + "reward_std": 4.935064792633057, + "rewards/fitness_reward/mean": 6.867335319519043, + "rewards/fitness_reward/std": 2.219416379928589, + "rewards/kidney_reward/mean": 2.1714673042297363, + "rewards/kidney_reward/std": 1.151047945022583, + "rewards/length2tails_reward/mean": 0.6821848154067993, + "rewards/length2tails_reward/std": 0.29430484771728516, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4743170738220215, + "rewards/thermo_reward/std": 2.287351608276367, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11091689392924309, + "epoch": 1.226, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08150680363178253, + "learning_rate": 1.7513816644680124e-06, + "loss": -0.0003, + "num_tokens": 5325748.0, + "reward": 12.91486930847168, + "reward_std": 2.003068208694458, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.517043113708496, + "rewards/kidney_reward/std": 0.2941751182079315, + "rewards/length2tails_reward/mean": 0.7082419395446777, + "rewards/length2tails_reward/std": 0.33722445368766785, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.980835437774658, + "rewards/thermo_reward/std": 1.5609627962112427, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5, + "completions/mean_terminated_length": 272.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10319388285279274, + "epoch": 1.228, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15667755901813507, + "learning_rate": 1.750534909817561e-06, + "loss": 0.0037, + "num_tokens": 5334500.0, + "reward": 12.450875282287598, + "reward_std": 4.654170989990234, + "rewards/fitness_reward/mean": 7.026939392089844, + "rewards/fitness_reward/std": 1.8907779455184937, + "rewards/kidney_reward/mean": 2.3262205123901367, + "rewards/kidney_reward/std": 1.1446887254714966, + "rewards/length2tails_reward/mean": 0.7560428380966187, + "rewards/length2tails_reward/std": 0.32128700613975525, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9221115112304688, + "rewards/thermo_reward/std": 2.0271341800689697, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.71875, + "completions/mean_terminated_length": 273.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10905345156788826, + "epoch": 1.23, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10238877683877945, + "learning_rate": 1.7496869211010314e-06, + "loss": -0.0055, + "num_tokens": 5343291.0, + "reward": 12.982217788696289, + "reward_std": 2.0052247047424316, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.517043113708496, + "rewards/kidney_reward/std": 0.2941751182079315, + "rewards/length2tails_reward/mean": 0.8131692409515381, + "rewards/length2tails_reward/std": 0.2821599245071411, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.922672986984253, + "rewards/thermo_reward/std": 1.738611102104187, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10712381731718779, + "epoch": 1.232, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07008599489927292, + "learning_rate": 1.7488376997127282e-06, + "loss": -0.0033, + "num_tokens": 5352009.0, + "reward": 13.434881210327148, + "reward_std": 0.9187362194061279, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.6913747787475586, + "rewards/length2tails_reward/std": 0.3248680531978607, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3903069496154785, + "rewards/thermo_reward/std": 0.6159141063690186, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 267.96875, + "completions/mean_terminated_length": 267.96875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.11229937896132469, + "epoch": 1.234, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.48756271600723267, + "learning_rate": 1.7479872470489823e-06, + "loss": -0.058, + "num_tokens": 5360616.0, + "reward": 11.352066040039062, + "reward_std": 6.411108493804932, + "rewards/fitness_reward/mean": 6.309316635131836, + "rewards/fitness_reward/std": 3.334545373916626, + "rewards/kidney_reward/mean": 2.0699820518493652, + "rewards/kidney_reward/std": 1.5359529256820679, + "rewards/length2tails_reward/mean": 0.7619932889938354, + "rewards/length2tails_reward/std": 0.31214094161987305, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.796567678451538, + "rewards/thermo_reward/std": 2.15085768699646, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09793747030198574, + "epoch": 1.236, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.074201799929142, + "learning_rate": 1.7471355645081495e-06, + "loss": -0.0065, + "num_tokens": 5369371.0, + "reward": 11.125179290771484, + "reward_std": 6.282215118408203, + "rewards/fitness_reward/mean": 6.593475341796875, + "rewards/fitness_reward/std": 2.824373722076416, + "rewards/kidney_reward/mean": 2.0429749488830566, + "rewards/kidney_reward/std": 1.5781100988388062, + "rewards/length2tails_reward/mean": 0.7332035303115845, + "rewards/length2tails_reward/std": 0.3300551772117615, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.315408229827881, + "rewards/thermo_reward/std": 2.452349901199341, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.09375, + "completions/mean_terminated_length": 271.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10096073895692825, + "epoch": 1.238, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08234567940235138, + "learning_rate": 1.7462826534906078e-06, + "loss": -0.0054, + "num_tokens": 5378078.0, + "reward": 12.692102432250977, + "reward_std": 3.2898201942443848, + "rewards/fitness_reward/mean": 7.0526275634765625, + "rewards/fitness_reward/std": 1.7454651594161987, + "rewards/kidney_reward/mean": 2.5099849700927734, + "rewards/kidney_reward/std": 0.5226497054100037, + "rewards/length2tails_reward/mean": 0.6364307403564453, + "rewards/length2tails_reward/std": 0.3661468029022217, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9658467769622803, + "rewards/thermo_reward/std": 1.6006118059158325, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12040111888200045, + "epoch": 1.24, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11013831198215485, + "learning_rate": 1.7454285153987552e-06, + "loss": -0.0011, + "num_tokens": 5386817.0, + "reward": 13.46084976196289, + "reward_std": 0.7325454950332642, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.774829089641571, + "rewards/length2tails_reward/std": 0.24128678441047668, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3504209518432617, + "rewards/thermo_reward/std": 0.6277978420257568, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10458789858967066, + "epoch": 1.242, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08587851375341415, + "learning_rate": 1.744573151637007e-06, + "loss": -0.0077, + "num_tokens": 5395532.0, + "reward": 12.471405982971191, + "reward_std": 3.1049158573150635, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.452592670917511, + "rewards/kidney_reward/mean": 2.276317596435547, + "rewards/kidney_reward/std": 1.0352544784545898, + "rewards/length2tails_reward/mean": 0.6744846105575562, + "rewards/length2tails_reward/std": 0.343734472990036, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7814741134643555, + "rewards/thermo_reward/std": 1.9983136653900146, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10727937240153551, + "epoch": 1.244, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12728402018547058, + "learning_rate": 1.7437165636117939e-06, + "loss": -0.004, + "num_tokens": 5404287.0, + "reward": 13.306811332702637, + "reward_std": 1.1299161911010742, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7561923265457153, + "rewards/length2tails_reward/std": 0.28997352719306946, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2557549476623535, + "rewards/thermo_reward/std": 0.9147730469703674, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10531806293874979, + "epoch": 1.246, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08190429955720901, + "learning_rate": 1.7428587527315596e-06, + "loss": -0.0059, + "num_tokens": 5413001.0, + "reward": 12.839922904968262, + "reward_std": 2.6034655570983887, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.419279098510742, + "rewards/kidney_reward/std": 0.7175997495651245, + "rewards/length2tails_reward/mean": 0.6887946128845215, + "rewards/length2tails_reward/std": 0.33503562211990356, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8905797004699707, + "rewards/thermo_reward/std": 1.9347537755966187, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09187165554612875, + "epoch": 1.248, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08307106792926788, + "learning_rate": 1.741999720406759e-06, + "loss": -0.0078, + "num_tokens": 5421694.0, + "reward": 10.21883773803711, + "reward_std": 7.325775623321533, + "rewards/fitness_reward/mean": 5.902102470397949, + "rewards/fitness_reward/std": 3.7705349922180176, + "rewards/kidney_reward/mean": 1.8158752918243408, + "rewards/kidney_reward/std": 1.697677493095398, + "rewards/length2tails_reward/mean": 0.6075935363769531, + "rewards/length2tails_reward/std": 0.38769808411598206, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.340100049972534, + "rewards/thermo_reward/std": 2.136388063430786, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.34375, + "completions/mean_terminated_length": 269.34375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0934913819655776, + "epoch": 1.25, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09207896888256073, + "learning_rate": 1.7411394680498548e-06, + "loss": 0.0015, + "num_tokens": 5430345.0, + "reward": 12.443601608276367, + "reward_std": 3.250117301940918, + "rewards/fitness_reward/mean": 7.188657760620117, + "rewards/fitness_reward/std": 0.7179933190345764, + "rewards/kidney_reward/mean": 2.3776919841766357, + "rewards/kidney_reward/std": 0.8947495222091675, + "rewards/length2tails_reward/mean": 0.49976906180381775, + "rewards/length2tails_reward/std": 0.38500484824180603, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7272748947143555, + "rewards/thermo_reward/std": 1.8555277585983276, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11000367347151041, + "epoch": 1.252, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07254158705472946, + "learning_rate": 1.7402779970753154e-06, + "loss": -0.0092, + "num_tokens": 5439087.0, + "reward": 11.843017578125, + "reward_std": 3.7989084720611572, + "rewards/fitness_reward/mean": 6.99554443359375, + "rewards/fitness_reward/std": 1.7628074884414673, + "rewards/kidney_reward/mean": 2.329591751098633, + "rewards/kidney_reward/std": 0.8785039782524109, + "rewards/length2tails_reward/mean": 0.7139759659767151, + "rewards/length2tails_reward/std": 0.33523863554000854, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.3464832305908203, + "rewards/thermo_reward/std": 1.9240782260894775, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.09375, + "completions/mean_terminated_length": 273.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09588021412491798, + "epoch": 1.254, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0824722871184349, + "learning_rate": 1.7394153088996139e-06, + "loss": -0.0071, + "num_tokens": 5447858.0, + "reward": 12.915249824523926, + "reward_std": 1.940028190612793, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5070910453796387, + "rewards/kidney_reward/std": 0.5383089184761047, + "rewards/length2tails_reward/mean": 0.8070980310440063, + "rewards/length2tails_reward/std": 0.2656845450401306, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8662641048431396, + "rewards/thermo_reward/std": 1.528942584991455, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.09873269777745008, + "epoch": 1.256, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21686367690563202, + "learning_rate": 1.738551404941224e-06, + "loss": -0.0014, + "num_tokens": 5456603.0, + "reward": 11.136815071105957, + "reward_std": 6.538121223449707, + "rewards/fitness_reward/mean": 6.224935531616211, + "rewards/fitness_reward/std": 3.4062118530273438, + "rewards/kidney_reward/mean": 2.1010398864746094, + "rewards/kidney_reward/std": 1.5924327373504639, + "rewards/length2tails_reward/mean": 0.7687352299690247, + "rewards/length2tails_reward/std": 0.28859299421310425, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6339666843414307, + "rewards/thermo_reward/std": 2.106649160385132, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09568661265075207, + "epoch": 1.258, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10661531984806061, + "learning_rate": 1.7376862866206186e-06, + "loss": -0.0041, + "num_tokens": 5465329.0, + "reward": 11.562700271606445, + "reward_std": 6.629768371582031, + "rewards/fitness_reward/mean": 6.559837341308594, + "rewards/fitness_reward/std": 2.9754812717437744, + "rewards/kidney_reward/mean": 2.104600429534912, + "rewards/kidney_reward/std": 1.686941385269165, + "rewards/length2tails_reward/mean": 0.7176464796066284, + "rewards/length2tails_reward/std": 0.3340187668800354, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7264976501464844, + "rewards/thermo_reward/std": 2.2392425537109375, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09941136837005615, + "epoch": 1.26, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06564158946275711, + "learning_rate": 1.7368199553602674e-06, + "loss": -0.0052, + "num_tokens": 5474052.0, + "reward": 12.620133399963379, + "reward_std": 2.574862003326416, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.486088991165161, + "rewards/kidney_reward/std": 0.6529962420463562, + "rewards/length2tails_reward/mean": 0.7097842693328857, + "rewards/length2tails_reward/std": 0.30757448077201843, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6593899726867676, + "rewards/thermo_reward/std": 1.9267350435256958, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.10511992871761322, + "epoch": 1.262, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0890965685248375, + "learning_rate": 1.735952412584635e-06, + "loss": -0.0028, + "num_tokens": 5482780.0, + "reward": 12.693354606628418, + "reward_std": 2.413665533065796, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.483823776245117, + "rewards/kidney_reward/std": 0.5328060388565063, + "rewards/length2tails_reward/mean": 0.7528306245803833, + "rewards/length2tails_reward/std": 0.249490424990654, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.673062324523926, + "rewards/thermo_reward/std": 2.0427510738372803, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.3125, + "completions/mean_terminated_length": 273.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10003827605396509, + "epoch": 1.264, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10966615378856659, + "learning_rate": 1.7350836597201765e-06, + "loss": -0.0008, + "num_tokens": 5491558.0, + "reward": 12.770936965942383, + "reward_std": 4.350057601928711, + "rewards/fitness_reward/mean": 7.010485649108887, + "rewards/fitness_reward/std": 1.9838539361953735, + "rewards/kidney_reward/mean": 2.367945671081543, + "rewards/kidney_reward/std": 1.016711950302124, + "rewards/length2tails_reward/mean": 0.779668927192688, + "rewards/length2tails_reward/std": 0.3014443516731262, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2145395278930664, + "rewards/thermo_reward/std": 1.5500909090042114, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10240908991545439, + "epoch": 1.266, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0859563872218132, + "learning_rate": 1.7342136981953387e-06, + "loss": 0.0013, + "num_tokens": 5500260.0, + "reward": 12.530839920043945, + "reward_std": 3.073016881942749, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3721253871917725, + "rewards/kidney_reward/std": 0.9938655495643616, + "rewards/length2tails_reward/mean": 0.6619828343391418, + "rewards/length2tails_reward/std": 0.33176231384277344, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.688839912414551, + "rewards/thermo_reward/std": 1.9960085153579712, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10022217966616154, + "epoch": 1.268, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14917409420013428, + "learning_rate": 1.7333425294405545e-06, + "loss": -0.0025, + "num_tokens": 5508973.0, + "reward": 12.465929985046387, + "reward_std": 2.9090192317962646, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.239741802215576, + "rewards/kidney_reward/std": 1.0994014739990234, + "rewards/length2tails_reward/mean": 0.6749511361122131, + "rewards/length2tails_reward/std": 0.3254489600658417, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6975083351135254, + "rewards/thermo_reward/std": 2.029533624649048, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10164427850395441, + "epoch": 1.27, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07312794029712677, + "learning_rate": 1.7324701548882418e-06, + "loss": -0.0075, + "num_tokens": 5517692.0, + "reward": 13.093249320983887, + "reward_std": 1.8741158246994019, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7035744190216064, + "rewards/length2tails_reward/std": 0.3234020471572876, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.96258544921875, + "rewards/thermo_reward/std": 1.7842705249786377, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09657697845250368, + "epoch": 1.272, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08342888206243515, + "learning_rate": 1.7315965759728013e-06, + "loss": -0.0092, + "num_tokens": 5526446.0, + "reward": 12.716133117675781, + "reward_std": 2.338914632797241, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.41390061378479, + "rewards/kidney_reward/std": 0.6353449821472168, + "rewards/length2tails_reward/mean": 0.7105174660682678, + "rewards/length2tails_reward/std": 0.35961011052131653, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.76999568939209, + "rewards/thermo_reward/std": 1.8458997011184692, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09856419172137976, + "epoch": 1.274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07856637984514236, + "learning_rate": 1.7307217941306143e-06, + "loss": -0.0037, + "num_tokens": 5535191.0, + "reward": 12.767711639404297, + "reward_std": 3.3116235733032227, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.457012176513672, + "rewards/kidney_reward/std": 0.5447914004325867, + "rewards/length2tails_reward/mean": 0.8048985004425049, + "rewards/length2tails_reward/std": 0.26381057500839233, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.077155590057373, + "rewards/thermo_reward/std": 1.5934613943099976, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09590794518589973, + "epoch": 1.276, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10152795165777206, + "learning_rate": 1.7298458108000397e-06, + "loss": -0.0103, + "num_tokens": 5543917.0, + "reward": 12.167291641235352, + "reward_std": 3.5369625091552734, + "rewards/fitness_reward/mean": 7.015777587890625, + "rewards/fitness_reward/std": 1.9539211988449097, + "rewards/kidney_reward/mean": 2.3080854415893555, + "rewards/kidney_reward/std": 0.8558576107025146, + "rewards/length2tails_reward/mean": 0.7294836044311523, + "rewards/length2tails_reward/std": 0.3052122890949249, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.670480728149414, + "rewards/thermo_reward/std": 2.1437149047851562, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 268.21875, + "completions/mean_terminated_length": 268.21875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "entropy": 0.1038109278306365, + "epoch": 1.278, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07300246506929398, + "learning_rate": 1.7289686274214115e-06, + "loss": -0.0044, + "num_tokens": 5552532.0, + "reward": 12.078481674194336, + "reward_std": 4.850605010986328, + "rewards/fitness_reward/mean": 7.006332874298096, + "rewards/fitness_reward/std": 2.0073466300964355, + "rewards/kidney_reward/mean": 2.214355945587158, + "rewards/kidney_reward/std": 1.273207664489746, + "rewards/length2tails_reward/mean": 0.6589057445526123, + "rewards/length2tails_reward/std": 0.337287575006485, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6919021606445312, + "rewards/thermo_reward/std": 2.128614664077759, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11321705486625433, + "epoch": 1.28, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08817020803689957, + "learning_rate": 1.728090245437038e-06, + "loss": -0.0026, + "num_tokens": 5561289.0, + "reward": 13.19006633758545, + "reward_std": 1.383169174194336, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7889949083328247, + "rewards/length2tails_reward/std": 0.24998170137405396, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1055798530578613, + "rewards/thermo_reward/std": 1.1738027334213257, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09994053188711405, + "epoch": 1.282, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11618512123823166, + "learning_rate": 1.7272106662911971e-06, + "loss": -0.0028, + "num_tokens": 5570031.0, + "reward": 11.90559196472168, + "reward_std": 3.7982654571533203, + "rewards/fitness_reward/mean": 6.99554443359375, + "rewards/fitness_reward/std": 1.7628074884414673, + "rewards/kidney_reward/mean": 2.3164985179901123, + "rewards/kidney_reward/std": 0.8215380907058716, + "rewards/length2tails_reward/mean": 0.7287619113922119, + "rewards/length2tails_reward/std": 0.32916101813316345, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4206724166870117, + "rewards/thermo_reward/std": 2.0186150074005127, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10221993364393711, + "epoch": 1.284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17188133299350739, + "learning_rate": 1.7263298914301365e-06, + "loss": 0.0036, + "num_tokens": 5578728.0, + "reward": 12.869743347167969, + "reward_std": 2.4321837425231934, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.401759147644043, + "rewards/kidney_reward/std": 0.7849282622337341, + "rewards/length2tails_reward/mean": 0.6851349472999573, + "rewards/length2tails_reward/std": 0.3111562430858612, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9382848739624023, + "rewards/thermo_reward/std": 1.7535111904144287, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11419525649398565, + "epoch": 1.286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12147994339466095, + "learning_rate": 1.7254479223020683e-06, + "loss": 0.0008, + "num_tokens": 5587456.0, + "reward": 10.761012077331543, + "reward_std": 6.3070526123046875, + "rewards/fitness_reward/mean": 6.451765060424805, + "rewards/fitness_reward/std": 2.736926317214966, + "rewards/kidney_reward/mean": 1.9459079504013062, + "rewards/kidney_reward/std": 1.570999026298523, + "rewards/length2tails_reward/mean": 0.682193398475647, + "rewards/length2tails_reward/std": 0.3580475449562073, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.1951191425323486, + "rewards/thermo_reward/std": 2.5102174282073975, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10469977743923664, + "epoch": 1.288, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10412470996379852, + "learning_rate": 1.7245647603571701e-06, + "loss": -0.0009, + "num_tokens": 5596172.0, + "reward": 11.901531219482422, + "reward_std": 4.787817478179932, + "rewards/fitness_reward/mean": 7.004397392272949, + "rewards/fitness_reward/std": 2.018296003341675, + "rewards/kidney_reward/mean": 2.2795729637145996, + "rewards/kidney_reward/std": 1.190348744392395, + "rewards/length2tails_reward/mean": 0.6881000399589539, + "rewards/length2tails_reward/std": 0.3569899797439575, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4487509727478027, + "rewards/thermo_reward/std": 2.2187716960906982, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10657103080302477, + "epoch": 1.29, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06384288519620895, + "learning_rate": 1.7236804070475786e-06, + "loss": -0.0035, + "num_tokens": 5604896.0, + "reward": 11.755149841308594, + "reward_std": 4.59630012512207, + "rewards/fitness_reward/mean": 6.955345630645752, + "rewards/fitness_reward/std": 1.986746907234192, + "rewards/kidney_reward/mean": 2.236717700958252, + "rewards/kidney_reward/std": 1.0580967664718628, + "rewards/length2tails_reward/mean": 0.6883218884468079, + "rewards/length2tails_reward/std": 0.3535401523113251, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.394253730773926, + "rewards/thermo_reward/std": 2.3135814666748047, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10480425134301186, + "epoch": 1.292, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.089121013879776, + "learning_rate": 1.7227948638273915e-06, + "loss": -0.0065, + "num_tokens": 5613659.0, + "reward": 12.493803024291992, + "reward_std": 2.4889931678771973, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4569358825683594, + "rewards/kidney_reward/std": 0.5451717972755432, + "rewards/length2tails_reward/mean": 0.7576339244842529, + "rewards/length2tails_reward/std": 0.31424081325531006, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.499917984008789, + "rewards/thermo_reward/std": 2.142576217651367, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11304991412907839, + "epoch": 1.294, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0759241059422493, + "learning_rate": 1.7219081321526616e-06, + "loss": -0.0063, + "num_tokens": 5622425.0, + "reward": 12.531675338745117, + "reward_std": 4.246157646179199, + "rewards/fitness_reward/mean": 7.038208484649658, + "rewards/fitness_reward/std": 1.8270317316055298, + "rewards/kidney_reward/mean": 2.4285740852355957, + "rewards/kidney_reward/std": 0.9721502661705017, + "rewards/length2tails_reward/mean": 0.7843444347381592, + "rewards/length2tails_reward/std": 0.31813105940818787, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8864593505859375, + "rewards/thermo_reward/std": 1.7416695356369019, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10981705971062183, + "epoch": 1.296, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09525270760059357, + "learning_rate": 1.7210202134813968e-06, + "loss": -0.004, + "num_tokens": 5631135.0, + "reward": 12.248756408691406, + "reward_std": 4.042028903961182, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.452592670917511, + "rewards/kidney_reward/mean": 2.1447324752807617, + "rewards/kidney_reward/std": 1.3550604581832886, + "rewards/length2tails_reward/mean": 0.6968757510185242, + "rewards/length2tails_reward/std": 0.319642037153244, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6881699562072754, + "rewards/thermo_reward/std": 2.5021729469299316, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09238530695438385, + "epoch": 1.298, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0611051470041275, + "learning_rate": 1.7201311092735562e-06, + "loss": -0.0065, + "num_tokens": 5639845.0, + "reward": 11.624662399291992, + "reward_std": 5.298251152038574, + "rewards/fitness_reward/mean": 6.383270740509033, + "rewards/fitness_reward/std": 3.0991289615631104, + "rewards/kidney_reward/mean": 2.249495506286621, + "rewards/kidney_reward/std": 0.9490680694580078, + "rewards/length2tails_reward/mean": 0.6396400928497314, + "rewards/length2tails_reward/std": 0.3494209349155426, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8279314041137695, + "rewards/thermo_reward/std": 1.9005476236343384, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.3125, + "completions/mean_terminated_length": 273.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10085057467222214, + "epoch": 1.3, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08008405566215515, + "learning_rate": 1.719240820991048e-06, + "loss": -0.0085, + "num_tokens": 5648623.0, + "reward": 12.24567699432373, + "reward_std": 3.9568021297454834, + "rewards/fitness_reward/mean": 6.99554443359375, + "rewards/fitness_reward/std": 1.7628074884414673, + "rewards/kidney_reward/mean": 2.364567279815674, + "rewards/kidney_reward/std": 0.8396555185317993, + "rewards/length2tails_reward/mean": 0.7675321102142334, + "rewards/length2tails_reward/std": 0.32164403796195984, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7088122367858887, + "rewards/thermo_reward/std": 2.098717451095581, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10491770878434181, + "epoch": 1.302, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09322385489940643, + "learning_rate": 1.7183493500977275e-06, + "loss": -0.0081, + "num_tokens": 5657381.0, + "reward": 11.527852058410645, + "reward_std": 4.960941791534424, + "rewards/fitness_reward/mean": 6.880526065826416, + "rewards/fitness_reward/std": 1.8558024168014526, + "rewards/kidney_reward/mean": 2.10536527633667, + "rewards/kidney_reward/std": 1.4573596715927124, + "rewards/length2tails_reward/mean": 0.7661879062652588, + "rewards/length2tails_reward/std": 0.31442517042160034, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.365342378616333, + "rewards/thermo_reward/std": 2.4136619567871094, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11029501724988222, + "epoch": 1.304, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07199763506650925, + "learning_rate": 1.717456698059395e-06, + "loss": -0.0028, + "num_tokens": 5666102.0, + "reward": 12.992130279541016, + "reward_std": 2.2417500019073486, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.452592670917511, + "rewards/kidney_reward/mean": 2.4329657554626465, + "rewards/kidney_reward/std": 0.7647271752357483, + "rewards/length2tails_reward/mean": 0.6814402937889099, + "rewards/length2tails_reward/std": 0.33173587918281555, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.144853115081787, + "rewards/thermo_reward/std": 1.3190016746520996, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09424859192222357, + "epoch": 1.306, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05711875110864639, + "learning_rate": 1.716562866343792e-06, + "loss": -0.0071, + "num_tokens": 5674831.0, + "reward": 12.767280578613281, + "reward_std": 2.380852222442627, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.501613140106201, + "rewards/kidney_reward/std": 0.5680586695671082, + "rewards/length2tails_reward/mean": 0.6926798820495605, + "rewards/length2tails_reward/std": 0.33684486150741577, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.735213279724121, + "rewards/thermo_reward/std": 1.9477479457855225, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.1875, + "completions/mean_terminated_length": 273.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10056958720088005, + "epoch": 1.308, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07294055819511414, + "learning_rate": 1.7156678564206008e-06, + "loss": 0.0007, + "num_tokens": 5683605.0, + "reward": 12.314053535461426, + "reward_std": 4.731460094451904, + "rewards/fitness_reward/mean": 7.004344463348389, + "rewards/fitness_reward/std": 2.0185959339141846, + "rewards/kidney_reward/mean": 2.2853360176086426, + "rewards/kidney_reward/std": 1.1564087867736816, + "rewards/length2tails_reward/mean": 0.8089523315429688, + "rewards/length2tails_reward/std": 0.25128456950187683, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.843478202819824, + "rewards/thermo_reward/std": 2.112743854522705, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1040391530841589, + "epoch": 1.31, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15061677992343903, + "learning_rate": 1.7147716697614398e-06, + "loss": -0.0033, + "num_tokens": 5692334.0, + "reward": 12.341215133666992, + "reward_std": 3.8126730918884277, + "rewards/fitness_reward/mean": 6.586376190185547, + "rewards/fitness_reward/std": 2.8321077823638916, + "rewards/kidney_reward/mean": 2.4342212677001953, + "rewards/kidney_reward/std": 0.6610793471336365, + "rewards/length2tails_reward/mean": 0.7200385332107544, + "rewards/length2tails_reward/std": 0.31445440649986267, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.148613929748535, + "rewards/thermo_reward/std": 1.3085203170776367, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10377451218664646, + "epoch": 1.312, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14977595210075378, + "learning_rate": 1.713874307839863e-06, + "loss": 0.0007, + "num_tokens": 5701072.0, + "reward": 12.491024017333984, + "reward_std": 2.8138327598571777, + "rewards/fitness_reward/mean": 7.188657283782959, + "rewards/fitness_reward/std": 0.7179933190345764, + "rewards/kidney_reward/mean": 2.4353067874908447, + "rewards/kidney_reward/std": 0.6554340720176697, + "rewards/length2tails_reward/mean": 0.7257574796676636, + "rewards/length2tails_reward/std": 0.3124699890613556, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.694483757019043, + "rewards/thermo_reward/std": 2.06559681892395, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10568260494619608, + "epoch": 1.314, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05766943842172623, + "learning_rate": 1.7129757721313568e-06, + "loss": -0.0062, + "num_tokens": 5709788.0, + "reward": 12.49569320678711, + "reward_std": 3.365452766418457, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.3099136352539062, + "rewards/kidney_reward/std": 0.7445457577705383, + "rewards/length2tails_reward/mean": 0.7206904888153076, + "rewards/length2tails_reward/std": 0.2951766550540924, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9606573581695557, + "rewards/thermo_reward/std": 1.4621469974517822, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10371254477649927, + "epoch": 1.316, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0695052519440651, + "learning_rate": 1.7120760641133367e-06, + "loss": -0.0026, + "num_tokens": 5718523.0, + "reward": 12.284019470214844, + "reward_std": 3.2596521377563477, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.452592670917511, + "rewards/kidney_reward/mean": 2.1595919132232666, + "rewards/kidney_reward/std": 1.2625459432601929, + "rewards/length2tails_reward/mean": 0.7355765700340271, + "rewards/length2tails_reward/std": 0.3051132559776306, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7047033309936523, + "rewards/thermo_reward/std": 1.9182021617889404, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0999411167576909, + "epoch": 1.318, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10656177997589111, + "learning_rate": 1.7111751852651473e-06, + "loss": -0.0069, + "num_tokens": 5727254.0, + "reward": 11.863635063171387, + "reward_std": 5.591757297515869, + "rewards/fitness_reward/mean": 6.636547088623047, + "rewards/fitness_reward/std": 2.6368114948272705, + "rewards/kidney_reward/mean": 2.2023534774780273, + "rewards/kidney_reward/std": 1.3406885862350464, + "rewards/length2tails_reward/mean": 0.6827249526977539, + "rewards/length2tails_reward/std": 0.36245542764663696, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.856461763381958, + "rewards/thermo_reward/std": 1.8226885795593262, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10554569493979216, + "epoch": 1.32, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11007196456193924, + "learning_rate": 1.710273137068057e-06, + "loss": -0.0056, + "num_tokens": 5735976.0, + "reward": 13.34743881225586, + "reward_std": 1.5250986814498901, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.6940564513206482, + "rewards/length2tails_reward/std": 0.3022388815879822, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2724452018737793, + "rewards/thermo_reward/std": 1.2907878160476685, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10180851072072983, + "epoch": 1.322, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08281734585762024, + "learning_rate": 1.7093699210052577e-06, + "loss": -0.0059, + "num_tokens": 5744677.0, + "reward": 12.452281951904297, + "reward_std": 3.473741292953491, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.3302063941955566, + "rewards/kidney_reward/std": 0.7663140892982483, + "rewards/length2tails_reward/mean": 0.6239981651306152, + "rewards/length2tails_reward/std": 0.3571479320526123, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.906623125076294, + "rewards/thermo_reward/std": 1.6070977449417114, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.8125, + "completions/mean_terminated_length": 272.8125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10252044629305601, + "epoch": 1.324, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06473301351070404, + "learning_rate": 1.708465538561861e-06, + "loss": -0.0021, + "num_tokens": 5753439.0, + "reward": 12.452521324157715, + "reward_std": 4.619300842285156, + "rewards/fitness_reward/mean": 7.002726078033447, + "rewards/fitness_reward/std": 2.0277509689331055, + "rewards/kidney_reward/mean": 2.3650083541870117, + "rewards/kidney_reward/std": 1.032787799835205, + "rewards/length2tails_reward/mean": 0.7824528813362122, + "rewards/length2tails_reward/std": 0.28417396545410156, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.906541347503662, + "rewards/thermo_reward/std": 2.026707887649536, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09698578109964728, + "epoch": 1.326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09350641071796417, + "learning_rate": 1.707559991224897e-06, + "loss": -0.0027, + "num_tokens": 5762171.0, + "reward": 11.34994125366211, + "reward_std": 5.410051345825195, + "rewards/fitness_reward/mean": 6.9835734367370605, + "rewards/fitness_reward/std": 2.136094808578491, + "rewards/kidney_reward/mean": 2.084345579147339, + "rewards/kidney_reward/std": 1.4731652736663818, + "rewards/length2tails_reward/mean": 0.7022289633750916, + "rewards/length2tails_reward/std": 0.34806326031684875, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.1117992401123047, + "rewards/thermo_reward/std": 2.696627140045166, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09784791991114616, + "epoch": 1.328, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07207152247428894, + "learning_rate": 1.70665328048331e-06, + "loss": -0.0049, + "num_tokens": 5770895.0, + "reward": 12.333126068115234, + "reward_std": 2.750267267227173, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.3594183921813965, + "rewards/kidney_reward/std": 0.8543692231178284, + "rewards/length2tails_reward/mean": 0.6763886213302612, + "rewards/length2tails_reward/std": 0.33825093507766724, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.444882869720459, + "rewards/thermo_reward/std": 2.0760297775268555, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.34375, + "completions/mean_terminated_length": 273.34375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10427253041416407, + "epoch": 1.33, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13702253997325897, + "learning_rate": 1.705745407827959e-06, + "loss": 0.0057, + "num_tokens": 5779674.0, + "reward": 11.255656242370605, + "reward_std": 6.599266529083252, + "rewards/fitness_reward/mean": 6.487942695617676, + "rewards/fitness_reward/std": 2.807030439376831, + "rewards/kidney_reward/mean": 1.972966194152832, + "rewards/kidney_reward/std": 1.6536844968795776, + "rewards/length2tails_reward/mean": 0.7791774272918701, + "rewards/length2tails_reward/std": 0.29168036580085754, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6168298721313477, + "rewards/thermo_reward/std": 2.366076946258545, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10135769378393888, + "epoch": 1.332, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0624493770301342, + "learning_rate": 1.7048363747516117e-06, + "loss": -0.0052, + "num_tokens": 5788390.0, + "reward": 12.884943008422852, + "reward_std": 2.232584238052368, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4621386528015137, + "rewards/kidney_reward/std": 0.6469355821609497, + "rewards/length2tails_reward/mean": 0.6847348213195801, + "rewards/length2tails_reward/std": 0.33357691764831543, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.89314603805542, + "rewards/thermo_reward/std": 1.7530150413513184, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10582635086029768, + "epoch": 1.334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11112461239099503, + "learning_rate": 1.7039261827489448e-06, + "loss": -0.0051, + "num_tokens": 5797146.0, + "reward": 10.896465301513672, + "reward_std": 7.279848575592041, + "rewards/fitness_reward/mean": 6.5933732986450195, + "rewards/fitness_reward/std": 3.0246944427490234, + "rewards/kidney_reward/mean": 2.111262798309326, + "rewards/kidney_reward/std": 1.919937252998352, + "rewards/length2tails_reward/mean": 0.6648454070091248, + "rewards/length2tails_reward/std": 0.3326110243797302, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.025343894958496, + "rewards/thermo_reward/std": 2.9303271770477295, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10798757802695036, + "epoch": 1.336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10530514270067215, + "learning_rate": 1.7030148333165406e-06, + "loss": -0.0034, + "num_tokens": 5805894.0, + "reward": 13.327153205871582, + "reward_std": 1.369816541671753, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.748895525932312, + "rewards/length2tails_reward/std": 0.31100350618362427, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1645989418029785, + "rewards/thermo_reward/std": 1.3630008697509766, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10481843631714582, + "epoch": 1.338, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11892516911029816, + "learning_rate": 1.702102327952884e-06, + "loss": 0.0018, + "num_tokens": 5814627.0, + "reward": 12.889093399047852, + "reward_std": 2.3293163776397705, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.46122407913208, + "rewards/kidney_reward/std": 0.6518173217773438, + "rewards/length2tails_reward/mean": 0.7261526584625244, + "rewards/length2tails_reward/std": 0.31381848454475403, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9515786170959473, + "rewards/thermo_reward/std": 1.6045955419540405, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.8125, + "completions/mean_terminated_length": 273.8125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.1091268165037036, + "epoch": 1.34, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1734755039215088, + "learning_rate": 1.7011886681583607e-06, + "loss": -0.0011, + "num_tokens": 5823421.0, + "reward": 12.640786170959473, + "reward_std": 3.061244010925293, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.376332998275757, + "rewards/kidney_reward/std": 0.884257435798645, + "rewards/length2tails_reward/mean": 0.7936819791793823, + "rewards/length2tails_reward/std": 0.2865562438964844, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8389179706573486, + "rewards/thermo_reward/std": 1.8952285051345825, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10370411397889256, + "epoch": 1.342, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11059630662202835, + "learning_rate": 1.7002738554352548e-06, + "loss": -0.0007, + "num_tokens": 5832155.0, + "reward": 11.552849769592285, + "reward_std": 5.098392963409424, + "rewards/fitness_reward/mean": 6.611078262329102, + "rewards/fitness_reward/std": 2.732717752456665, + "rewards/kidney_reward/mean": 2.1995174884796143, + "rewards/kidney_reward/std": 1.1577376127243042, + "rewards/length2tails_reward/mean": 0.724226176738739, + "rewards/length2tails_reward/std": 0.31711849570274353, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.569831371307373, + "rewards/thermo_reward/std": 2.242499589920044, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10955226328223944, + "epoch": 1.3439999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07638181000947952, + "learning_rate": 1.6993578912877462e-06, + "loss": -0.0004, + "num_tokens": 5840920.0, + "reward": 12.043489456176758, + "reward_std": 4.820775032043457, + "rewards/fitness_reward/mean": 6.899222373962402, + "rewards/fitness_reward/std": 1.9935646057128906, + "rewards/kidney_reward/mean": 2.306669235229492, + "rewards/kidney_reward/std": 1.1659220457077026, + "rewards/length2tails_reward/mean": 0.8214871287345886, + "rewards/length2tails_reward/std": 0.22858977317810059, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6554486751556396, + "rewards/thermo_reward/std": 2.1794092655181885, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09130106586962938, + "epoch": 1.346, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1298259198665619, + "learning_rate": 1.6984407772219073e-06, + "loss": -0.0047, + "num_tokens": 5849592.0, + "reward": 12.563082695007324, + "reward_std": 3.830124616622925, + "rewards/fitness_reward/mean": 6.621788024902344, + "rewards/fitness_reward/std": 2.7010300159454346, + "rewards/kidney_reward/mean": 2.4843716621398926, + "rewards/kidney_reward/std": 0.5299732089042664, + "rewards/length2tails_reward/mean": 0.6012275218963623, + "rewards/length2tails_reward/std": 0.35651201009750366, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2968006134033203, + "rewards/thermo_reward/std": 0.906000018119812, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10582957789301872, + "epoch": 1.3479999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23371417820453644, + "learning_rate": 1.6975225147457024e-06, + "loss": 0.0039, + "num_tokens": 5858352.0, + "reward": 12.074682235717773, + "reward_std": 5.514390468597412, + "rewards/fitness_reward/mean": 6.95579719543457, + "rewards/fitness_reward/std": 2.2932186126708984, + "rewards/kidney_reward/mean": 2.3026914596557617, + "rewards/kidney_reward/std": 1.376892328262329, + "rewards/length2tails_reward/mean": 0.7860455513000488, + "rewards/length2tails_reward/std": 0.2741868793964386, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6375892162323, + "rewards/thermo_reward/std": 2.3904688358306885, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09917244967073202, + "epoch": 1.35, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08062262088060379, + "learning_rate": 1.6966031053689827e-06, + "loss": -0.0034, + "num_tokens": 5867038.0, + "reward": 13.281015396118164, + "reward_std": 1.455064058303833, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.650111734867096, + "rewards/length2tails_reward/std": 0.3185828924179077, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.155698299407959, + "rewards/thermo_reward/std": 1.426273226737976, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 273.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11477402225136757, + "epoch": 1.3519999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06305685639381409, + "learning_rate": 1.6956825506034863e-06, + "loss": -0.0056, + "num_tokens": 5875818.0, + "reward": 12.572243690490723, + "reward_std": 2.5010037422180176, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.3857381343841553, + "rewards/kidney_reward/std": 0.7801180481910706, + "rewards/length2tails_reward/mean": 0.7871044874191284, + "rewards/length2tails_reward/std": 0.30373579263687134, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6466102600097656, + "rewards/thermo_reward/std": 1.8559621572494507, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10278540477156639, + "epoch": 1.354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08388179540634155, + "learning_rate": 1.6947608519628342e-06, + "loss": -0.0005, + "num_tokens": 5884565.0, + "reward": 13.578239440917969, + "reward_std": 0.7047902345657349, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7521419525146484, + "rewards/length2tails_reward/std": 0.29483869671821594, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.470078945159912, + "rewards/thermo_reward/std": 0.5830413699150085, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 274.15625, + "completions/mean_terminated_length": 274.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1032821387052536, + "epoch": 1.3559999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10173728317022324, + "learning_rate": 1.693838010962528e-06, + "loss": -0.0028, + "num_tokens": 5893370.0, + "reward": 12.261163711547852, + "reward_std": 4.252288341522217, + "rewards/fitness_reward/mean": 6.9797563552856445, + "rewards/fitness_reward/std": 1.8506603240966797, + "rewards/kidney_reward/mean": 2.328238010406494, + "rewards/kidney_reward/std": 0.9794105291366577, + "rewards/length2tails_reward/mean": 0.8271920680999756, + "rewards/length2tails_reward/std": 0.26984670758247375, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.770451545715332, + "rewards/thermo_reward/std": 1.8565285205841064, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.71875, + "completions/mean_terminated_length": 272.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10730791371315718, + "epoch": 1.358, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07551854848861694, + "learning_rate": 1.6929140291199482e-06, + "loss": -0.0015, + "num_tokens": 5902129.0, + "reward": 12.194419860839844, + "reward_std": 4.578545570373535, + "rewards/fitness_reward/mean": 6.999178886413574, + "rewards/fitness_reward/std": 2.0478174686431885, + "rewards/kidney_reward/mean": 2.3207554817199707, + "rewards/kidney_reward/std": 0.9847736358642578, + "rewards/length2tails_reward/mean": 0.747252881526947, + "rewards/length2tails_reward/std": 0.31738021969795227, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6997597217559814, + "rewards/thermo_reward/std": 2.0591087341308594, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10397808719426394, + "epoch": 1.3599999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12071414291858673, + "learning_rate": 1.6919889079543505e-06, + "loss": 0.002, + "num_tokens": 5910880.0, + "reward": 11.56614875793457, + "reward_std": 6.464588642120361, + "rewards/fitness_reward/mean": 6.585282802581787, + "rewards/fitness_reward/std": 2.8306221961975098, + "rewards/kidney_reward/mean": 2.123476505279541, + "rewards/kidney_reward/std": 1.6230067014694214, + "rewards/length2tails_reward/mean": 0.752088189125061, + "rewards/length2tails_reward/std": 0.2834548056125641, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6821799278259277, + "rewards/thermo_reward/std": 2.190772533416748, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11202096100896597, + "epoch": 1.362, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2075084149837494, + "learning_rate": 1.6910626489868648e-06, + "loss": 0.0027, + "num_tokens": 5919610.0, + "reward": 12.229260444641113, + "reward_std": 4.530606269836426, + "rewards/fitness_reward/mean": 6.927616119384766, + "rewards/fitness_reward/std": 1.896013855934143, + "rewards/kidney_reward/mean": 2.3120498657226562, + "rewards/kidney_reward/std": 1.084660291671753, + "rewards/length2tails_reward/mean": 0.7117540836334229, + "rewards/length2tails_reward/std": 0.3070603907108307, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8184189796447754, + "rewards/thermo_reward/std": 1.822995901107788, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 273.6875, + "completions/mean_terminated_length": 273.6875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12129755318164825, + "epoch": 1.3639999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1131335198879242, + "learning_rate": 1.69013525374049e-06, + "loss": -0.0035, + "num_tokens": 5928400.0, + "reward": 12.55710220336914, + "reward_std": 2.2411751747131348, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4991579055786133, + "rewards/kidney_reward/std": 0.581434965133667, + "rewards/length2tails_reward/mean": 0.7221885919570923, + "rewards/length2tails_reward/std": 0.29065218567848206, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.5245399475097656, + "rewards/thermo_reward/std": 1.9252128601074219, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10431716963648796, + "epoch": 1.366, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13056468963623047, + "learning_rate": 1.6892067237400957e-06, + "loss": -0.0037, + "num_tokens": 5937116.0, + "reward": 12.706653594970703, + "reward_std": 2.178776502609253, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.3893089294433594, + "rewards/kidney_reward/std": 0.7356569170951843, + "rewards/length2tails_reward/mean": 0.6970548629760742, + "rewards/length2tails_reward/std": 0.3222173750400543, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.786454200744629, + "rewards/thermo_reward/std": 1.6334350109100342, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.09375, + "completions/mean_terminated_length": 273.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10924753546714783, + "epoch": 1.3679999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1930069625377655, + "learning_rate": 1.688277060512416e-06, + "loss": -0.0012, + "num_tokens": 5945887.0, + "reward": 12.911041259765625, + "reward_std": 2.489048957824707, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.427684783935547, + "rewards/kidney_reward/std": 0.6952542662620544, + "rewards/length2tails_reward/mean": 0.7463508248329163, + "rewards/length2tails_reward/std": 0.34181684255599976, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.005045175552368, + "rewards/thermo_reward/std": 1.6862391233444214, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.21875, + "completions/mean_terminated_length": 273.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10252848453819752, + "epoch": 1.37, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10934145003557205, + "learning_rate": 1.687346265586048e-06, + "loss": -0.006, + "num_tokens": 5954662.0, + "reward": 12.280689239501953, + "reward_std": 3.0210068225860596, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.1778016090393066, + "rewards/kidney_reward/std": 0.9150532484054565, + "rewards/length2tails_reward/mean": 0.7558243870735168, + "rewards/length2tails_reward/std": 0.33266839385032654, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.623629093170166, + "rewards/thermo_reward/std": 2.0062804222106934, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09957588836550713, + "epoch": 1.3719999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09618765115737915, + "learning_rate": 1.6864143404914504e-06, + "loss": -0.0034, + "num_tokens": 5963410.0, + "reward": 13.563940048217773, + "reward_std": 0.6711642742156982, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7344149351119995, + "rewards/length2tails_reward/std": 0.3277231454849243, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4301929473876953, + "rewards/thermo_reward/std": 0.6010707020759583, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09993568249046803, + "epoch": 1.374, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11227070540189743, + "learning_rate": 1.6854812867609395e-06, + "loss": -0.0058, + "num_tokens": 5972158.0, + "reward": 12.18494701385498, + "reward_std": 3.8810713291168213, + "rewards/fitness_reward/mean": 6.744510650634766, + "rewards/fitness_reward/std": 2.426586151123047, + "rewards/kidney_reward/mean": 2.402294158935547, + "rewards/kidney_reward/std": 0.5692219734191895, + "rewards/length2tails_reward/mean": 0.7263046503067017, + "rewards/length2tails_reward/std": 0.34151479601860046, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8655123710632324, + "rewards/thermo_reward/std": 1.7246884107589722, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10269459336996078, + "epoch": 1.376, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07501363009214401, + "learning_rate": 1.6845471059286886e-06, + "loss": -0.0071, + "num_tokens": 5980903.0, + "reward": 12.991816520690918, + "reward_std": 2.287548065185547, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.485689640045166, + "rewards/kidney_reward/std": 0.6551914811134338, + "rewards/length2tails_reward/mean": 0.7599628567695618, + "rewards/length2tails_reward/std": 0.2903619408607483, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.026454448699951, + "rewards/thermo_reward/std": 1.536690354347229, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.71875, + "completions/mean_terminated_length": 273.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1052359938621521, + "epoch": 1.3780000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08972465991973877, + "learning_rate": 1.6836117995307225e-06, + "loss": -0.0019, + "num_tokens": 5989694.0, + "reward": 13.074825286865234, + "reward_std": 1.664375901222229, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.517043113708496, + "rewards/kidney_reward/std": 0.2941751182079315, + "rewards/length2tails_reward/mean": 0.8253493309020996, + "rewards/length2tails_reward/std": 0.24605530500411987, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0140621662139893, + "rewards/thermo_reward/std": 1.5036872625350952, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10593629349023104, + "epoch": 1.38, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1128479540348053, + "learning_rate": 1.682675369104918e-06, + "loss": -0.0049, + "num_tokens": 5998458.0, + "reward": 12.63668441772461, + "reward_std": 2.3853824138641357, + "rewards/fitness_reward/mean": 7.131148338317871, + "rewards/fitness_reward/std": 0.7751544713973999, + "rewards/kidney_reward/mean": 2.4066221714019775, + "rewards/kidney_reward/std": 0.672130823135376, + "rewards/length2tails_reward/mean": 0.7614470720291138, + "rewards/length2tails_reward/std": 0.30095815658569336, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.922769546508789, + "rewards/thermo_reward/std": 1.4726288318634033, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1032063439488411, + "epoch": 1.3820000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09008792787790298, + "learning_rate": 1.6817378161909995e-06, + "loss": -0.0049, + "num_tokens": 6007156.0, + "reward": 12.718839645385742, + "reward_std": 3.3231112957000732, + "rewards/fitness_reward/mean": 6.999629974365234, + "rewards/fitness_reward/std": 2.04526424407959, + "rewards/kidney_reward/mean": 2.4188828468322754, + "rewards/kidney_reward/std": 0.7191933393478394, + "rewards/length2tails_reward/mean": 0.7066649198532104, + "rewards/length2tails_reward/std": 0.30196088552474976, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.129659652709961, + "rewards/thermo_reward/std": 1.5934593677520752, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10248612426221371, + "epoch": 1.384, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05736776441335678, + "learning_rate": 1.6807991423305372e-06, + "loss": -0.0026, + "num_tokens": 6015896.0, + "reward": 13.128643035888672, + "reward_std": 1.5915932655334473, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7266826629638672, + "rewards/length2tails_reward/std": 0.3352401554584503, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.050387382507324, + "rewards/thermo_reward/std": 1.3753472566604614, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10259763896465302, + "epoch": 1.3860000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30000802874565125, + "learning_rate": 1.6798593490669444e-06, + "loss": 0.003, + "num_tokens": 6024662.0, + "reward": 12.33034896850586, + "reward_std": 5.298914909362793, + "rewards/fitness_reward/mean": 6.8621296882629395, + "rewards/fitness_reward/std": 2.247586488723755, + "rewards/kidney_reward/mean": 2.294316291809082, + "rewards/kidney_reward/std": 1.4534074068069458, + "rewards/length2tails_reward/mean": 0.7780168056488037, + "rewards/length2tails_reward/std": 0.3289325535297394, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.996100902557373, + "rewards/thermo_reward/std": 1.8078289031982422, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.11150118336081505, + "epoch": 1.388, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1403384655714035, + "learning_rate": 1.678918437945475e-06, + "loss": -0.0065, + "num_tokens": 6033419.0, + "reward": 12.41234016418457, + "reward_std": 4.395359992980957, + "rewards/fitness_reward/mean": 7.020390510559082, + "rewards/fitness_reward/std": 1.9278241395950317, + "rewards/kidney_reward/mean": 2.3808698654174805, + "rewards/kidney_reward/std": 1.0912655591964722, + "rewards/length2tails_reward/mean": 0.7625837922096252, + "rewards/length2tails_reward/std": 0.2701271176338196, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8348212242126465, + "rewards/thermo_reward/std": 1.7445740699768066, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.40625, + "completions/mean_terminated_length": 272.40625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10476927366107702, + "epoch": 1.3900000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08929768949747086, + "learning_rate": 1.677976410513221e-06, + "loss": -0.002, + "num_tokens": 6042168.0, + "reward": 13.310680389404297, + "reward_std": 1.8905342817306519, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5304651260375977, + "rewards/kidney_reward/std": 0.5431409478187561, + "rewards/length2tails_reward/mean": 0.774996280670166, + "rewards/length2tails_reward/std": 0.2649703621864319, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.299039602279663, + "rewards/thermo_reward/std": 1.1939067840576172, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1053413487970829, + "epoch": 1.392, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12015575915575027, + "learning_rate": 1.6770332683191095e-06, + "loss": -0.0012, + "num_tokens": 6050926.0, + "reward": 11.121023178100586, + "reward_std": 6.33602237701416, + "rewards/fitness_reward/mean": 6.40739631652832, + "rewards/fitness_reward/std": 3.015723943710327, + "rewards/kidney_reward/mean": 1.9892921447753906, + "rewards/kidney_reward/std": 1.7127926349639893, + "rewards/length2tails_reward/mean": 0.750725269317627, + "rewards/length2tails_reward/std": 0.3456234037876129, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.549262523651123, + "rewards/thermo_reward/std": 2.3009464740753174, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1106234323233366, + "epoch": 1.3940000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10471359640359879, + "learning_rate": 1.6760890129139012e-06, + "loss": -0.0004, + "num_tokens": 6059639.0, + "reward": 13.341649055480957, + "reward_std": 1.8602107763290405, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.534843683242798, + "rewards/kidney_reward/std": 0.5183712840080261, + "rewards/length2tails_reward/mean": 0.6905333399772644, + "rewards/length2tails_reward/std": 0.3257606327533722, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.276567220687866, + "rewards/thermo_reward/std": 1.4345035552978516, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10321180988103151, + "epoch": 1.396, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08855379372835159, + "learning_rate": 1.6751436458501868e-06, + "loss": -0.006, + "num_tokens": 6068350.0, + "reward": 12.729233741760254, + "reward_std": 2.860873222351074, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.401076316833496, + "rewards/kidney_reward/std": 0.9794626235961914, + "rewards/length2tails_reward/mean": 0.6787518262863159, + "rewards/length2tails_reward/std": 0.3055427372455597, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7990972995758057, + "rewards/thermo_reward/std": 2.0861966609954834, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.09375, + "completions/mean_terminated_length": 273.09375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10413977596908808, + "epoch": 1.3980000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36088234186172485, + "learning_rate": 1.6741971686823849e-06, + "loss": -0.0063, + "num_tokens": 6077121.0, + "reward": 12.361455917358398, + "reward_std": 4.51021146774292, + "rewards/fitness_reward/mean": 6.9834794998168945, + "rewards/fitness_reward/std": 1.8299298286437988, + "rewards/kidney_reward/mean": 2.290621280670166, + "rewards/kidney_reward/std": 1.148695945739746, + "rewards/length2tails_reward/mean": 0.8251867294311523, + "rewards/length2tails_reward/std": 0.23498550057411194, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.904836654663086, + "rewards/thermo_reward/std": 1.8687138557434082, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.53125, + "completions/mean_terminated_length": 273.53125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09921541810035706, + "epoch": 1.4, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07918018102645874, + "learning_rate": 1.6732495829667395e-06, + "loss": -0.0057, + "num_tokens": 6085906.0, + "reward": 12.201481819152832, + "reward_std": 4.22837495803833, + "rewards/fitness_reward/mean": 6.993868827819824, + "rewards/fitness_reward/std": 1.7721247673034668, + "rewards/kidney_reward/mean": 2.2571630477905273, + "rewards/kidney_reward/std": 1.0439294576644897, + "rewards/length2tails_reward/mean": 0.8327365517616272, + "rewards/length2tails_reward/std": 0.23388439416885376, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7671761512756348, + "rewards/thermo_reward/std": 1.8673089742660522, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09286362864077091, + "epoch": 1.4020000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07303542643785477, + "learning_rate": 1.6723008902613168e-06, + "loss": -0.0066, + "num_tokens": 6094622.0, + "reward": 12.563545227050781, + "reward_std": 3.932305335998535, + "rewards/fitness_reward/mean": 6.680659294128418, + "rewards/fitness_reward/std": 2.6905438899993896, + "rewards/kidney_reward/mean": 2.4843716621398926, + "rewards/kidney_reward/std": 0.5299732089042664, + "rewards/length2tails_reward/mean": 0.6491783857345581, + "rewards/length2tails_reward/std": 0.34321579337120056, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2335963249206543, + "rewards/thermo_reward/std": 1.3015984296798706, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09672014974057674, + "epoch": 1.404, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09479635953903198, + "learning_rate": 1.6713510921260038e-06, + "loss": -0.0057, + "num_tokens": 6103366.0, + "reward": 11.757293701171875, + "reward_std": 4.412098407745361, + "rewards/fitness_reward/mean": 6.99554443359375, + "rewards/fitness_reward/std": 1.7628074884414673, + "rewards/kidney_reward/mean": 2.1463756561279297, + "rewards/kidney_reward/std": 1.3339651823043823, + "rewards/length2tails_reward/mean": 0.7187809944152832, + "rewards/length2tails_reward/std": 0.3444777727127075, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.443495035171509, + "rewards/thermo_reward/std": 2.35288405418396, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10314061678946018, + "epoch": 1.4060000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08774527162313461, + "learning_rate": 1.670400190122505e-06, + "loss": -0.0032, + "num_tokens": 6112096.0, + "reward": 12.55300521850586, + "reward_std": 2.4551267623901367, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.442668914794922, + "rewards/kidney_reward/std": 0.6173912882804871, + "rewards/length2tails_reward/mean": 0.7243757843971252, + "rewards/length2tails_reward/std": 0.2981627285480499, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.634222984313965, + "rewards/thermo_reward/std": 1.8824540376663208, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0946558965370059, + "epoch": 1.408, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10838892310857773, + "learning_rate": 1.66944818581434e-06, + "loss": 0.0017, + "num_tokens": 6120789.0, + "reward": 12.890625, + "reward_std": 2.3468823432922363, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4862608909606934, + "rewards/kidney_reward/std": 0.6520509719848633, + "rewards/length2tails_reward/mean": 0.6216571927070618, + "rewards/length2tails_reward/std": 0.3502180874347687, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9385228157043457, + "rewards/thermo_reward/std": 1.6677196025848389, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10168962553143501, + "epoch": 1.41, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0808025449514389, + "learning_rate": 1.6684950807668402e-06, + "loss": -0.0036, + "num_tokens": 6129505.0, + "reward": 13.156676292419434, + "reward_std": 1.6391278505325317, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.6781437397003174, + "rewards/length2tails_reward/std": 0.31954437494277954, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0832748413085938, + "rewards/thermo_reward/std": 1.4239745140075684, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09764528181403875, + "epoch": 1.412, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07843001186847687, + "learning_rate": 1.6675408765471479e-06, + "loss": -0.0046, + "num_tokens": 6138214.0, + "reward": 13.312134742736816, + "reward_std": 1.5263793468475342, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.6829698085784912, + "rewards/length2tails_reward/std": 0.3603144586086273, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2108922004699707, + "rewards/thermo_reward/std": 1.381186842918396, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10325187537819147, + "epoch": 1.414, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12241241335868835, + "learning_rate": 1.6665855747242117e-06, + "loss": 0.0035, + "num_tokens": 6146932.0, + "reward": 11.462259292602539, + "reward_std": 6.049312591552734, + "rewards/fitness_reward/mean": 6.809179782867432, + "rewards/fitness_reward/std": 2.2334835529327393, + "rewards/kidney_reward/mean": 2.035914897918701, + "rewards/kidney_reward/std": 1.6355434656143188, + "rewards/length2tails_reward/mean": 0.6988885402679443, + "rewards/length2tails_reward/std": 0.29368865489959717, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4472761154174805, + "rewards/thermo_reward/std": 2.5866122245788574, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0953157301992178, + "epoch": 1.416, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06981104612350464, + "learning_rate": 1.6656291768687855e-06, + "loss": -0.007, + "num_tokens": 6155647.0, + "reward": 11.916261672973633, + "reward_std": 3.774662971496582, + "rewards/fitness_reward/mean": 6.99554443359375, + "rewards/fitness_reward/std": 1.7628074884414673, + "rewards/kidney_reward/mean": 2.2872872352600098, + "rewards/kidney_reward/std": 0.833087682723999, + "rewards/length2tails_reward/mean": 0.6991365551948547, + "rewards/length2tails_reward/std": 0.3266415297985077, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4635162353515625, + "rewards/thermo_reward/std": 1.992389440536499, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 267.15625, + "completions/mean_terminated_length": 267.15625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.11413798667490482, + "epoch": 1.418, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5822765231132507, + "learning_rate": 1.6646716845534256e-06, + "loss": -0.0898, + "num_tokens": 6164228.0, + "reward": 11.752373695373535, + "reward_std": 6.4400410652160645, + "rewards/fitness_reward/mean": 6.625027179718018, + "rewards/fitness_reward/std": 2.897608995437622, + "rewards/kidney_reward/mean": 2.1439552307128906, + "rewards/kidney_reward/std": 1.5757523775100708, + "rewards/length2tails_reward/mean": 0.8399752974510193, + "rewards/length2tails_reward/std": 0.22871538996696472, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.799393653869629, + "rewards/thermo_reward/std": 2.2180750370025635, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11446765344589949, + "epoch": 1.42, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1195162832736969, + "learning_rate": 1.663713099352487e-06, + "loss": -0.0033, + "num_tokens": 6172969.0, + "reward": 10.830946922302246, + "reward_std": 6.950724124908447, + "rewards/fitness_reward/mean": 5.919102668762207, + "rewards/fitness_reward/std": 3.883455276489258, + "rewards/kidney_reward/mean": 2.1333961486816406, + "rewards/kidney_reward/std": 1.3962254524230957, + "rewards/length2tails_reward/mean": 0.741581916809082, + "rewards/length2tails_reward/std": 0.30954524874687195, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.60429048538208, + "rewards/thermo_reward/std": 2.073528528213501, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10294345114380121, + "epoch": 1.422, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09308013319969177, + "learning_rate": 1.662753422842123e-06, + "loss": 0.0011, + "num_tokens": 6181741.0, + "reward": 12.426822662353516, + "reward_std": 4.511376857757568, + "rewards/fitness_reward/mean": 7.002735137939453, + "rewards/fitness_reward/std": 2.027700424194336, + "rewards/kidney_reward/mean": 2.378523826599121, + "rewards/kidney_reward/std": 0.9589655995368958, + "rewards/length2tails_reward/mean": 0.8055219054222107, + "rewards/length2tails_reward/std": 0.2933149039745331, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.865011692047119, + "rewards/thermo_reward/std": 1.9666255712509155, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.625, + "completions/mean_terminated_length": 270.625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0997420297935605, + "epoch": 1.424, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08755825459957123, + "learning_rate": 1.6617926566002795e-06, + "loss": -0.0001, + "num_tokens": 6190433.0, + "reward": 11.679386138916016, + "reward_std": 5.396921634674072, + "rewards/fitness_reward/mean": 6.621771335601807, + "rewards/fitness_reward/std": 2.6982581615448, + "rewards/kidney_reward/mean": 2.2178728580474854, + "rewards/kidney_reward/std": 1.2350701093673706, + "rewards/length2tails_reward/mean": 0.6481842994689941, + "rewards/length2tails_reward/std": 0.310602068901062, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6749234199523926, + "rewards/thermo_reward/std": 2.1603336334228516, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 274.21875, + "completions/mean_terminated_length": 274.21875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10170354135334492, + "epoch": 1.426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10588368028402328, + "learning_rate": 1.660830802206696e-06, + "loss": 0.005, + "num_tokens": 6199240.0, + "reward": 12.550549507141113, + "reward_std": 3.580357313156128, + "rewards/fitness_reward/mean": 7.188657283782959, + "rewards/fitness_reward/std": 0.7179933190345764, + "rewards/kidney_reward/mean": 2.2572848796844482, + "rewards/kidney_reward/std": 1.0889778137207031, + "rewards/length2tails_reward/mean": 0.7912466526031494, + "rewards/length2tails_reward/std": 0.32440313696861267, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.925483226776123, + "rewards/thermo_reward/std": 1.9545879364013672, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.8125, + "completions/mean_terminated_length": 272.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10243859142065048, + "epoch": 1.428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07144709676504135, + "learning_rate": 1.6598678612429e-06, + "loss": -0.0089, + "num_tokens": 6208002.0, + "reward": 12.510882377624512, + "reward_std": 2.848557949066162, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4377520084381104, + "rewards/kidney_reward/std": 0.7783918380737305, + "rewards/length2tails_reward/mean": 0.7538530826568604, + "rewards/length2tails_reward/std": 0.30126190185546875, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.594069480895996, + "rewards/thermo_reward/std": 1.9884313344955444, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09988569654524326, + "epoch": 1.43, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10599660128355026, + "learning_rate": 1.658903835292206e-06, + "loss": -0.0001, + "num_tokens": 6216765.0, + "reward": 13.246397018432617, + "reward_std": 1.3967713117599487, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7891009449958801, + "rewards/length2tails_reward/std": 0.2741408050060272, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1618995666503906, + "rewards/thermo_reward/std": 1.2669503688812256, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11686312593519688, + "epoch": 1.432, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09374818205833435, + "learning_rate": 1.6579387259397126e-06, + "loss": -0.0041, + "num_tokens": 6225513.0, + "reward": 13.120552062988281, + "reward_std": 1.6873857975006104, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4534711837768555, + "rewards/kidney_reward/std": 0.8334063291549683, + "rewards/length2tails_reward/mean": 0.7929362058639526, + "rewards/length2tails_reward/std": 0.2758377194404602, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1266021728515625, + "rewards/thermo_reward/std": 1.4102927446365356, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0957229109480977, + "epoch": 1.434, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08483195304870605, + "learning_rate": 1.6569725347722993e-06, + "loss": -0.0063, + "num_tokens": 6234185.0, + "reward": 12.214631080627441, + "reward_std": 2.9265432357788086, + "rewards/fitness_reward/mean": 7.188657760620117, + "rewards/fitness_reward/std": 0.7179933190345764, + "rewards/kidney_reward/mean": 2.4142332077026367, + "rewards/kidney_reward/std": 0.766402006149292, + "rewards/length2tails_reward/mean": 0.5923624038696289, + "rewards/length2tails_reward/std": 0.33820590376853943, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.452504873275757, + "rewards/thermo_reward/std": 2.009061098098755, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09740118542686105, + "epoch": 1.436, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10787434875965118, + "learning_rate": 1.656005263378625e-06, + "loss": -0.0034, + "num_tokens": 6242916.0, + "reward": 12.084197998046875, + "reward_std": 5.40502405166626, + "rewards/fitness_reward/mean": 6.607909202575684, + "rewards/fitness_reward/std": 2.749068021774292, + "rewards/kidney_reward/mean": 2.319880247116089, + "rewards/kidney_reward/std": 1.0164239406585693, + "rewards/length2tails_reward/mean": 0.7005301117897034, + "rewards/length2tails_reward/std": 0.35030439496040344, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.986356258392334, + "rewards/thermo_reward/std": 1.8370975255966187, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09824726451188326, + "epoch": 1.438, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16927476227283478, + "learning_rate": 1.6550369133491247e-06, + "loss": -0.0026, + "num_tokens": 6251588.0, + "reward": 12.84244155883789, + "reward_std": 2.6119320392608643, + "rewards/fitness_reward/mean": 7.014434337615967, + "rewards/fitness_reward/std": 1.961518406867981, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.6008568406105042, + "rewards/length2tails_reward/std": 0.34025225043296814, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.096160411834717, + "rewards/thermo_reward/std": 1.3524459600448608, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10022869799286127, + "epoch": 1.44, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09501426666975021, + "learning_rate": 1.654067486276006e-06, + "loss": -0.0013, + "num_tokens": 6260310.0, + "reward": 13.449679374694824, + "reward_std": 1.3080042600631714, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.507431745529175, + "rewards/kidney_reward/std": 0.5364625453948975, + "rewards/length2tails_reward/mean": 0.7026489973068237, + "rewards/length2tails_reward/std": 0.34021517634391785, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.410797595977783, + "rewards/thermo_reward/std": 1.0919665098190308, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09479405451565981, + "epoch": 1.442, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3154640793800354, + "learning_rate": 1.6530969837532485e-06, + "loss": -0.0045, + "num_tokens": 6269030.0, + "reward": 13.004693984985352, + "reward_std": 2.8685030937194824, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.390768527984619, + "rewards/kidney_reward/std": 1.0364271402359009, + "rewards/length2tails_reward/mean": 0.6494479775428772, + "rewards/length2tails_reward/std": 0.34908527135849, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1453044414520264, + "rewards/thermo_reward/std": 1.6958801746368408, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.53125, + "completions/mean_terminated_length": 272.53125, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.11144114658236504, + "epoch": 1.444, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14011171460151672, + "learning_rate": 1.6521254073766e-06, + "loss": 0.0054, + "num_tokens": 6277783.0, + "reward": 12.596236228942871, + "reward_std": 2.8663110733032227, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3510470390319824, + "rewards/kidney_reward/std": 0.8927178382873535, + "rewards/length2tails_reward/mean": 0.7744334936141968, + "rewards/length2tails_reward/std": 0.20624327659606934, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.764070510864258, + "rewards/thermo_reward/std": 1.86115562915802, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10096946451812983, + "epoch": 1.446, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.087567999958992, + "learning_rate": 1.6511527587435735e-06, + "loss": -0.0009, + "num_tokens": 6286533.0, + "reward": 12.627382278442383, + "reward_std": 3.105064630508423, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3351755142211914, + "rewards/kidney_reward/std": 0.95881587266922, + "rewards/length2tails_reward/mean": 0.7469046115875244, + "rewards/length2tails_reward/std": 0.3319987952709198, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.813839912414551, + "rewards/thermo_reward/std": 2.0692062377929688, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0993009340018034, + "epoch": 1.448, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1495305299758911, + "learning_rate": 1.650179039453445e-06, + "loss": -0.0021, + "num_tokens": 6295274.0, + "reward": 11.36323070526123, + "reward_std": 6.454504013061523, + "rewards/fitness_reward/mean": 6.333192825317383, + "rewards/fitness_reward/std": 3.254973888397217, + "rewards/kidney_reward/mean": 2.0430855751037598, + "rewards/kidney_reward/std": 1.639970302581787, + "rewards/length2tails_reward/mean": 0.7403050661087036, + "rewards/length2tails_reward/std": 0.3240678608417511, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8129220008850098, + "rewards/thermo_reward/std": 2.1175599098205566, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.0625, + "completions/mean_terminated_length": 271.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09985631797462702, + "epoch": 1.45, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11432749032974243, + "learning_rate": 1.6492042511072518e-06, + "loss": 0.0021, + "num_tokens": 6303980.0, + "reward": 13.438583374023438, + "reward_std": 1.562416672706604, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.53704833984375, + "rewards/kidney_reward/std": 0.5059004426002502, + "rewards/length2tails_reward/mean": 0.7021323442459106, + "rewards/length2tails_reward/std": 0.2842758297920227, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3701372146606445, + "rewards/thermo_reward/std": 1.1033118963241577, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10494589246809483, + "epoch": 1.452, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09333200752735138, + "learning_rate": 1.6482283953077884e-06, + "loss": -0.0066, + "num_tokens": 6312720.0, + "reward": 13.020339965820312, + "reward_std": 1.564118504524231, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7386187314987183, + "rewards/length2tails_reward/std": 0.2925185263156891, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9408910274505615, + "rewards/thermo_reward/std": 1.4223616123199463, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.100801981985569, + "epoch": 1.454, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10188101977109909, + "learning_rate": 1.647251473659604e-06, + "loss": -0.0005, + "num_tokens": 6321467.0, + "reward": 12.78017807006836, + "reward_std": 2.7691338062286377, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.399458408355713, + "rewards/kidney_reward/std": 0.7088085412979126, + "rewards/length2tails_reward/mean": 0.7669350504875183, + "rewards/length2tails_reward/std": 0.27515965700149536, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9003515243530273, + "rewards/thermo_reward/std": 1.9575797319412231, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0940690515562892, + "epoch": 1.456, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13961991667747498, + "learning_rate": 1.6462734877690008e-06, + "loss": -0.0085, + "num_tokens": 6330188.0, + "reward": 12.996550559997559, + "reward_std": 1.7360068559646606, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.692048192024231, + "rewards/length2tails_reward/std": 0.31852561235427856, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.894399404525757, + "rewards/thermo_reward/std": 1.5485317707061768, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10566907282918692, + "epoch": 1.458, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15134963393211365, + "learning_rate": 1.645294439244031e-06, + "loss": -0.0018, + "num_tokens": 6338920.0, + "reward": 11.719990730285645, + "reward_std": 4.794181823730469, + "rewards/fitness_reward/mean": 6.929496765136719, + "rewards/fitness_reward/std": 2.1311330795288086, + "rewards/kidney_reward/mean": 2.1761727333068848, + "rewards/kidney_reward/std": 1.1675001382827759, + "rewards/length2tails_reward/mean": 0.722203254699707, + "rewards/length2tails_reward/std": 0.30619093775749207, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.442101001739502, + "rewards/thermo_reward/std": 2.2006373405456543, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09765789099037647, + "epoch": 1.46, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.054920729249715805, + "learning_rate": 1.6443143296944945e-06, + "loss": -0.0017, + "num_tokens": 6347635.0, + "reward": 12.432905197143555, + "reward_std": 5.245761871337891, + "rewards/fitness_reward/mean": 6.691164016723633, + "rewards/fitness_reward/std": 2.6455376148223877, + "rewards/kidney_reward/mean": 2.3567051887512207, + "rewards/kidney_reward/std": 1.1295703649520874, + "rewards/length2tails_reward/mean": 0.7048357725143433, + "rewards/length2tails_reward/std": 0.3146979808807373, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2145519256591797, + "rewards/thermo_reward/std": 1.5634750127792358, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09744808170944452, + "epoch": 1.462, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07000837475061417, + "learning_rate": 1.643333160731934e-06, + "loss": -0.0001, + "num_tokens": 6356395.0, + "reward": 12.698062896728516, + "reward_std": 4.5932297706604, + "rewards/fitness_reward/mean": 6.997720718383789, + "rewards/fitness_reward/std": 2.056065320968628, + "rewards/kidney_reward/mean": 2.4325098991394043, + "rewards/kidney_reward/std": 1.0972591638565063, + "rewards/length2tails_reward/mean": 0.7664821743965149, + "rewards/length2tails_reward/std": 0.30541515350341797, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.091184139251709, + "rewards/thermo_reward/std": 1.6904629468917847, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.28125, + "completions/mean_terminated_length": 270.28125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.09817949496209621, + "epoch": 1.464, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14221732318401337, + "learning_rate": 1.6423509339696362e-06, + "loss": -0.0015, + "num_tokens": 6365076.0, + "reward": 12.266221046447754, + "reward_std": 5.32376766204834, + "rewards/fitness_reward/mean": 6.971794605255127, + "rewards/fitness_reward/std": 2.2027249336242676, + "rewards/kidney_reward/mean": 2.2509796619415283, + "rewards/kidney_reward/std": 1.3944387435913086, + "rewards/length2tails_reward/mean": 0.721784234046936, + "rewards/length2tails_reward/std": 0.307940274477005, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8712682723999023, + "rewards/thermo_reward/std": 2.0534238815307617, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 272.8125, + "completions/mean_terminated_length": 272.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1109353955835104, + "epoch": 1.466, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08689557015895844, + "learning_rate": 1.6413676510226259e-06, + "loss": -0.005, + "num_tokens": 6373838.0, + "reward": 12.710731506347656, + "reward_std": 3.480628252029419, + "rewards/fitness_reward/mean": 6.987434387207031, + "rewards/fitness_reward/std": 2.1142518520355225, + "rewards/kidney_reward/mean": 2.431445837020874, + "rewards/kidney_reward/std": 0.6755548119544983, + "rewards/length2tails_reward/mean": 0.7463986873626709, + "rewards/length2tails_reward/std": 0.3106290400028229, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.11721134185791, + "rewards/thermo_reward/std": 1.5157802104949951, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10329800006002188, + "epoch": 1.468, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09860417246818542, + "learning_rate": 1.640383313507665e-06, + "loss": -0.0047, + "num_tokens": 6382569.0, + "reward": 13.229984283447266, + "reward_std": 1.7875175476074219, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7161481976509094, + "rewards/length2tails_reward/std": 0.3124900758266449, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0707039833068848, + "rewards/thermo_reward/std": 1.7761015892028809, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.4375, + "completions/mean_terminated_length": 273.4375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.10924804303795099, + "epoch": 1.47, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12164352834224701, + "learning_rate": 1.6393979230432494e-06, + "loss": 0.003, + "num_tokens": 6391351.0, + "reward": 12.864602088928223, + "reward_std": 3.003507375717163, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.3887598514556885, + "rewards/kidney_reward/std": 0.9033457040786743, + "rewards/length2tails_reward/mean": 0.8108813762664795, + "rewards/length2tails_reward/std": 0.2239287793636322, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0485873222351074, + "rewards/thermo_reward/std": 1.633050799369812, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "entropy": 0.10441209375858307, + "epoch": 1.472, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20049253106117249, + "learning_rate": 1.6384114812496055e-06, + "loss": -0.017, + "num_tokens": 6400062.0, + "reward": 12.799659729003906, + "reward_std": 2.6198184490203857, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.376730442047119, + "rewards/kidney_reward/std": 0.783859133720398, + "rewards/length2tails_reward/mean": 0.7949988842010498, + "rewards/length2tails_reward/std": 0.24802890419960022, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.939753770828247, + "rewards/thermo_reward/std": 1.6938996315002441, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 273.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09739785362035036, + "epoch": 1.474, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08881720155477524, + "learning_rate": 1.6374239897486897e-06, + "loss": -0.0008, + "num_tokens": 6408842.0, + "reward": 12.459671974182129, + "reward_std": 4.295812129974365, + "rewards/fitness_reward/mean": 7.014317512512207, + "rewards/fitness_reward/std": 1.9621782302856445, + "rewards/kidney_reward/mean": 2.3737847805023193, + "rewards/kidney_reward/std": 0.9848051071166992, + "rewards/length2tails_reward/mean": 0.7881827354431152, + "rewards/length2tails_reward/std": 0.2944299280643463, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.892751455307007, + "rewards/thermo_reward/std": 1.706197738647461, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10733734723180532, + "epoch": 1.476, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14616692066192627, + "learning_rate": 1.6364354501641833e-06, + "loss": 0.0013, + "num_tokens": 6417602.0, + "reward": 13.177677154541016, + "reward_std": 1.8004887104034424, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4556808471679688, + "rewards/kidney_reward/std": 0.5514306426048279, + "rewards/length2tails_reward/mean": 0.7851049304008484, + "rewards/length2tails_reward/std": 0.2981264889240265, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1823010444641113, + "rewards/thermo_reward/std": 1.3276283740997314, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.53125, + "completions/mean_terminated_length": 272.53125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.10223050322383642, + "epoch": 1.478, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0845065712928772, + "learning_rate": 1.635445864121491e-06, + "loss": -0.0022, + "num_tokens": 6426355.0, + "reward": 13.446077346801758, + "reward_std": 0.9884455800056458, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7850735187530518, + "rewards/length2tails_reward/std": 0.2458508163690567, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2799057960510254, + "rewards/thermo_reward/std": 0.9799438714981079, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.10738382302224636, + "epoch": 1.48, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12464555352926254, + "learning_rate": 1.6344552332477386e-06, + "loss": 0.0049, + "num_tokens": 6435086.0, + "reward": 13.028051376342773, + "reward_std": 2.2536768913269043, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4271481037139893, + "rewards/kidney_reward/std": 0.8362053036689758, + "rewards/length2tails_reward/mean": 0.7704687714576721, + "rewards/length2tails_reward/std": 0.2345697581768036, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0626707077026367, + "rewards/thermo_reward/std": 1.5018627643585205, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10015272535383701, + "epoch": 1.482, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08661410212516785, + "learning_rate": 1.63346355917177e-06, + "loss": -0.0032, + "num_tokens": 6443841.0, + "reward": 12.128244400024414, + "reward_std": 5.633739948272705, + "rewards/fitness_reward/mean": 6.676786422729492, + "rewards/fitness_reward/std": 2.7070510387420654, + "rewards/kidney_reward/mean": 2.29425048828125, + "rewards/kidney_reward/std": 1.291764497756958, + "rewards/length2tails_reward/mean": 0.7829782962799072, + "rewards/length2tails_reward/std": 0.29065847396850586, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.978910207748413, + "rewards/thermo_reward/std": 2.015270948410034, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09847946092486382, + "epoch": 1.484, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09854874759912491, + "learning_rate": 1.6324708435241434e-06, + "loss": 0.0039, + "num_tokens": 6452587.0, + "reward": 13.673116683959961, + "reward_std": 0.554030179977417, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7548766136169434, + "rewards/length2tails_reward/std": 0.26428091526031494, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.50996470451355, + "rewards/thermo_reward/std": 0.5615194439888, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.34375, + "completions/mean_terminated_length": 269.34375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "entropy": 0.09711272455751896, + "epoch": 1.486, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4529189169406891, + "learning_rate": 1.6314770879371312e-06, + "loss": -0.0275, + "num_tokens": 6461238.0, + "reward": 12.402647972106934, + "reward_std": 4.0601325035095215, + "rewards/fitness_reward/mean": 6.937617301940918, + "rewards/fitness_reward/std": 1.7827101945877075, + "rewards/kidney_reward/mean": 2.415391683578491, + "rewards/kidney_reward/std": 0.9006344676017761, + "rewards/length2tails_reward/mean": 0.692960262298584, + "rewards/length2tails_reward/std": 0.31155526638031006, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.880342483520508, + "rewards/thermo_reward/std": 1.7204630374908447, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0934545025229454, + "epoch": 1.488, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17958083748817444, + "learning_rate": 1.6304822940447136e-06, + "loss": -0.0003, + "num_tokens": 6469947.0, + "reward": 12.78943157196045, + "reward_std": 2.1344528198242188, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4513936042785645, + "rewards/kidney_reward/std": 0.7045504450798035, + "rewards/length2tails_reward/mean": 0.6639407873153687, + "rewards/length2tails_reward/std": 0.3690692186355591, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.810458183288574, + "rewards/thermo_reward/std": 1.6286946535110474, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10307030286639929, + "epoch": 1.49, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13122333586215973, + "learning_rate": 1.6294864634825802e-06, + "loss": -0.0041, + "num_tokens": 6478698.0, + "reward": 13.756085395812988, + "reward_std": 0.5152093768119812, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7868427038192749, + "rewards/length2tails_reward/std": 0.2657336890697479, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.84375, + "completions/mean_terminated_length": 273.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10226912703365088, + "epoch": 1.492, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12127039581537247, + "learning_rate": 1.6284895978881234e-06, + "loss": 0.0008, + "num_tokens": 6487493.0, + "reward": 13.357280731201172, + "reward_std": 1.3006410598754883, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.517043113708496, + "rewards/kidney_reward/std": 0.2941751182079315, + "rewards/length2tails_reward/mean": 0.8377007842063904, + "rewards/length2tails_reward/std": 0.2545430660247803, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2952828407287598, + "rewards/thermo_reward/std": 1.0900053977966309, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10751635394990444, + "epoch": 1.494, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20295307040214539, + "learning_rate": 1.6274916989004388e-06, + "loss": 0.0041, + "num_tokens": 6496241.0, + "reward": 12.564690589904785, + "reward_std": 4.798762321472168, + "rewards/fitness_reward/mean": 6.9990715980529785, + "rewards/fitness_reward/std": 2.0484225749969482, + "rewards/kidney_reward/mean": 2.416921615600586, + "rewards/kidney_reward/std": 1.185438632965088, + "rewards/length2tails_reward/mean": 0.7583932876586914, + "rewards/length2tails_reward/std": 0.2566794753074646, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.972858190536499, + "rewards/thermo_reward/std": 1.8865139484405518, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1033137459307909, + "epoch": 1.496, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11981922388076782, + "learning_rate": 1.6264927681603205e-06, + "loss": -0.0007, + "num_tokens": 6505005.0, + "reward": 12.776046752929688, + "reward_std": 2.2382259368896484, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.508439064025879, + "rewards/kidney_reward/std": 0.5310096144676208, + "rewards/length2tails_reward/mean": 0.774960458278656, + "rewards/length2tails_reward/std": 0.2797970771789551, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.786435842514038, + "rewards/thermo_reward/std": 1.6946086883544922, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09651091136038303, + "epoch": 1.498, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06410472095012665, + "learning_rate": 1.6254928073102584e-06, + "loss": -0.0056, + "num_tokens": 6513691.0, + "reward": 12.236513137817383, + "reward_std": 4.273045063018799, + "rewards/fitness_reward/mean": 6.682684898376465, + "rewards/fitness_reward/std": 2.681821346282959, + "rewards/kidney_reward/mean": 2.457012176513672, + "rewards/kidney_reward/std": 0.5447913408279419, + "rewards/length2tails_reward/mean": 0.622907280921936, + "rewards/length2tails_reward/std": 0.3190979063510895, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9345245361328125, + "rewards/thermo_reward/std": 1.5298142433166504, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09542472753673792, + "epoch": 1.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0805036649107933, + "learning_rate": 1.6244918179944378e-06, + "loss": -0.0053, + "num_tokens": 6522449.0, + "reward": 12.991552352905273, + "reward_std": 3.2086243629455566, + "rewards/fitness_reward/mean": 7.051550388336182, + "rewards/fitness_reward/std": 1.7515581846237183, + "rewards/kidney_reward/mean": 2.533559799194336, + "rewards/kidney_reward/std": 0.5256339907646179, + "rewards/length2tails_reward/mean": 0.7330061197280884, + "rewards/length2tails_reward/std": 0.33843839168548584, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2331414222717285, + "rewards/thermo_reward/std": 1.2927289009094238, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09929024521261454, + "epoch": 1.502, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0803636759519577, + "learning_rate": 1.6234898018587336e-06, + "loss": -0.0071, + "num_tokens": 6531161.0, + "reward": 12.638072967529297, + "reward_std": 3.315128803253174, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.356464385986328, + "rewards/kidney_reward/std": 0.9754756689071655, + "rewards/length2tails_reward/mean": 0.6804892420768738, + "rewards/length2tails_reward/std": 0.3451850116252899, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8098840713500977, + "rewards/thermo_reward/std": 2.2203240394592285, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10675702150911093, + "epoch": 1.504, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12016993016004562, + "learning_rate": 1.6224867605507092e-06, + "loss": -0.003, + "num_tokens": 6539909.0, + "reward": 13.501442909240723, + "reward_std": 0.9707752466201782, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7311062812805176, + "rewards/length2tails_reward/std": 0.2865833044052124, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3680262565612793, + "rewards/thermo_reward/std": 0.9308540225028992, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09785383474081755, + "epoch": 1.506, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13616763055324554, + "learning_rate": 1.6214826957196151e-06, + "loss": -0.0033, + "num_tokens": 6548629.0, + "reward": 13.107882499694824, + "reward_std": 2.6472818851470947, + "rewards/fitness_reward/mean": 6.991186618804932, + "rewards/fitness_reward/std": 2.093026638031006, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7090970873832703, + "rewards/length2tails_reward/std": 0.3214232325553894, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.401383876800537, + "rewards/thermo_reward/std": 0.9524527192115784, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11788608599454165, + "epoch": 1.508, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.058707550168037415, + "learning_rate": 1.6204776090163826e-06, + "loss": -0.0053, + "num_tokens": 6557387.0, + "reward": 13.186639785766602, + "reward_std": 3.0022521018981934, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.5390896797180176, + "rewards/kidney_reward/std": 0.49435171484947205, + "rewards/length2tails_reward/mean": 0.7803868055343628, + "rewards/length2tails_reward/std": 0.2767482101917267, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4164583683013916, + "rewards/thermo_reward/std": 0.8831133842468262, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.71875, + "completions/mean_terminated_length": 272.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11147108674049377, + "epoch": 1.51, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2016543447971344, + "learning_rate": 1.6194715020936248e-06, + "loss": 0.0, + "num_tokens": 6566146.0, + "reward": 12.690385818481445, + "reward_std": 5.008844375610352, + "rewards/fitness_reward/mean": 6.974052429199219, + "rewards/fitness_reward/std": 2.1899514198303223, + "rewards/kidney_reward/mean": 2.2993273735046387, + "rewards/kidney_reward/std": 1.3796072006225586, + "rewards/length2tails_reward/mean": 0.793893575668335, + "rewards/length2tails_reward/std": 0.23800085484981537, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2376160621643066, + "rewards/thermo_reward/std": 1.5841246843338013, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10156818572431803, + "epoch": 1.512, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07396720349788666, + "learning_rate": 1.6184643766056313e-06, + "loss": -0.0046, + "num_tokens": 6574911.0, + "reward": 13.152985572814941, + "reward_std": 1.8636387586593628, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7816914916038513, + "rewards/length2tails_reward/std": 0.28421974182128906, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.041869640350342, + "rewards/thermo_reward/std": 1.6973748207092285, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09756794199347496, + "epoch": 1.514, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23166704177856445, + "learning_rate": 1.6174562342083676e-06, + "loss": -0.0055, + "num_tokens": 6583653.0, + "reward": 12.610737800598145, + "reward_std": 3.5867321491241455, + "rewards/fitness_reward/mean": 7.0473952293396, + "rewards/fitness_reward/std": 1.7750624418258667, + "rewards/kidney_reward/mean": 2.4483442306518555, + "rewards/kidney_reward/std": 0.588398277759552, + "rewards/length2tails_reward/mean": 0.7579518556594849, + "rewards/length2tails_reward/std": 0.2971855103969574, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9392032623291016, + "rewards/thermo_reward/std": 1.8040894269943237, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 273.96875, + "completions/mean_terminated_length": 273.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10081325005739927, + "epoch": 1.516, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15814757347106934, + "learning_rate": 1.6164470765594697e-06, + "loss": 0.0018, + "num_tokens": 6592452.0, + "reward": 11.030250549316406, + "reward_std": 5.795161724090576, + "rewards/fitness_reward/mean": 6.506538391113281, + "rewards/fitness_reward/std": 2.9215149879455566, + "rewards/kidney_reward/mean": 1.9202601909637451, + "rewards/kidney_reward/std": 1.4814492464065552, + "rewards/length2tails_reward/mean": 0.7626752853393555, + "rewards/length2tails_reward/std": 0.30968478322029114, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4271841049194336, + "rewards/thermo_reward/std": 2.2704720497131348, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09835837502032518, + "epoch": 1.518, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1475781798362732, + "learning_rate": 1.6154369053182446e-06, + "loss": -0.0036, + "num_tokens": 6601167.0, + "reward": 12.41716194152832, + "reward_std": 3.9362692832946777, + "rewards/fitness_reward/mean": 7.052267074584961, + "rewards/fitness_reward/std": 1.7475041151046753, + "rewards/kidney_reward/mean": 2.367185115814209, + "rewards/kidney_reward/std": 0.8782473802566528, + "rewards/length2tails_reward/mean": 0.6665036082267761, + "rewards/length2tails_reward/std": 0.3518211245536804, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8310587406158447, + "rewards/thermo_reward/std": 1.8054473400115967, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 273.4375, + "completions/mean_terminated_length": 273.4375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.10087462235242128, + "epoch": 1.52, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15830565989017487, + "learning_rate": 1.6144257221456648e-06, + "loss": 0.0067, + "num_tokens": 6609949.0, + "reward": 12.919963836669922, + "reward_std": 2.08066987991333, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4414336681365967, + "rewards/kidney_reward/std": 0.6237428188323975, + "rewards/length2tails_reward/mean": 0.8254392147064209, + "rewards/length2tails_reward/std": 0.20629006624221802, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.992310047149658, + "rewards/thermo_reward/std": 1.2839230298995972, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.10661183949559927, + "epoch": 1.522, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07809071987867355, + "learning_rate": 1.6134135287043666e-06, + "loss": -0.0031, + "num_tokens": 6618665.0, + "reward": 13.041078567504883, + "reward_std": 1.7745665311813354, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8089865446090698, + "rewards/length2tails_reward/std": 0.2741064727306366, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.927234411239624, + "rewards/thermo_reward/std": 1.5899276733398438, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09442920796573162, + "epoch": 1.524, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.060308799147605896, + "learning_rate": 1.612400326658648e-06, + "loss": -0.007, + "num_tokens": 6627397.0, + "reward": 12.39229965209961, + "reward_std": 5.122243404388428, + "rewards/fitness_reward/mean": 6.723897933959961, + "rewards/fitness_reward/std": 2.509150981903076, + "rewards/kidney_reward/mean": 2.3572373390197754, + "rewards/kidney_reward/std": 1.1268643140792847, + "rewards/length2tails_reward/mean": 0.7106964588165283, + "rewards/length2tails_reward/std": 0.32191285490989685, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1400952339172363, + "rewards/thermo_reward/std": 1.7281146049499512, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09344307612627745, + "epoch": 1.526, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10824421793222427, + "learning_rate": 1.6113861176744657e-06, + "loss": -0.0051, + "num_tokens": 6636138.0, + "reward": 12.915403366088867, + "reward_std": 2.598353147506714, + "rewards/fitness_reward/mean": 7.011829853057861, + "rewards/fitness_reward/std": 1.9762507677078247, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7344396114349365, + "rewards/length2tails_reward/std": 0.2898414731025696, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1583681106567383, + "rewards/thermo_reward/std": 1.276017665863037, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0914773028343916, + "epoch": 1.528, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07586502283811569, + "learning_rate": 1.6103709034194307e-06, + "loss": -0.005, + "num_tokens": 6644865.0, + "reward": 13.459787368774414, + "reward_std": 2.1172537803649902, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5065970420837402, + "rewards/kidney_reward/std": 0.6781591773033142, + "rewards/length2tails_reward/mean": 0.7208532094955444, + "rewards/length2tails_reward/std": 0.2844887971878052, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.477428913116455, + "rewards/thermo_reward/std": 1.1407392024993896, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 264.71875, + "completions/mean_terminated_length": 264.71875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.10680115036666393, + "epoch": 1.53, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43259674310684204, + "learning_rate": 1.6093546855628081e-06, + "loss": -0.0975, + "num_tokens": 6653368.0, + "reward": 12.1873779296875, + "reward_std": 6.2827935218811035, + "rewards/fitness_reward/mean": 6.652431488037109, + "rewards/fitness_reward/std": 2.7991580963134766, + "rewards/kidney_reward/mean": 2.2365856170654297, + "rewards/kidney_reward/std": 1.5544345378875732, + "rewards/length2tails_reward/mean": 0.696941077709198, + "rewards/length2tails_reward/std": 0.3050379753112793, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.128666639328003, + "rewards/thermo_reward/std": 1.9798851013183594, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.625, + "completions/mean_terminated_length": 273.625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10760107543319464, + "epoch": 1.532, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12129205465316772, + "learning_rate": 1.608337465775513e-06, + "loss": 0.0028, + "num_tokens": 6662156.0, + "reward": 13.722698211669922, + "reward_std": 0.5326492190361023, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8518332242965698, + "rewards/length2tails_reward/std": 0.18499179184436798, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498504638671875, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.8125, + "completions/mean_terminated_length": 272.8125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10517558269202709, + "epoch": 1.534, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11031178385019302, + "learning_rate": 1.6073192457301078e-06, + "loss": -0.0063, + "num_tokens": 6670918.0, + "reward": 13.046195983886719, + "reward_std": 2.4702224731445312, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4882965087890625, + "rewards/kidney_reward/std": 0.7816820740699768, + "rewards/length2tails_reward/mean": 0.7864515781402588, + "rewards/length2tails_reward/std": 0.28928881883621216, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0755791664123535, + "rewards/thermo_reward/std": 1.6299465894699097, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09968086518347263, + "epoch": 1.536, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11324171721935272, + "learning_rate": 1.606300027100799e-06, + "loss": -0.0002, + "num_tokens": 6679652.0, + "reward": 12.413396835327148, + "reward_std": 4.348886966705322, + "rewards/fitness_reward/mean": 7.004317283630371, + "rewards/fitness_reward/std": 2.018749713897705, + "rewards/kidney_reward/mean": 2.41861629486084, + "rewards/kidney_reward/std": 0.8829330205917358, + "rewards/length2tails_reward/mean": 0.7344741821289062, + "rewards/length2tails_reward/std": 0.29354095458984375, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8170166015625, + "rewards/thermo_reward/std": 1.8177590370178223, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09980903100222349, + "epoch": 1.538, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07742294669151306, + "learning_rate": 1.6052798115634362e-06, + "loss": -0.0049, + "num_tokens": 6688392.0, + "reward": 13.258471488952637, + "reward_std": 1.6307097673416138, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7377700805664062, + "rewards/length2tails_reward/std": 0.2975505292415619, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.151747941970825, + "rewards/thermo_reward/std": 1.4797722101211548, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09326224122196436, + "epoch": 1.54, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06254822760820389, + "learning_rate": 1.6042586007955073e-06, + "loss": -0.0068, + "num_tokens": 6697102.0, + "reward": 12.24995231628418, + "reward_std": 4.322062969207764, + "rewards/fitness_reward/mean": 6.743242263793945, + "rewards/fitness_reward/std": 2.4315876960754395, + "rewards/kidney_reward/mean": 2.3956949710845947, + "rewards/kidney_reward/std": 0.7116798758506775, + "rewards/length2tails_reward/mean": 0.6738260984420776, + "rewards/length2tails_reward/std": 0.3288188576698303, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9436330795288086, + "rewards/thermo_reward/std": 1.6477420330047607, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.10175461042672396, + "epoch": 1.542, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11549469828605652, + "learning_rate": 1.6032363964761361e-06, + "loss": -0.0004, + "num_tokens": 6705833.0, + "reward": 13.348580360412598, + "reward_std": 1.3180879354476929, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7467554211616516, + "rewards/length2tails_reward/std": 0.2537044584751129, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.268317222595215, + "rewards/thermo_reward/std": 1.1638838052749634, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 273.375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.10368968173861504, + "epoch": 1.544, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0708426684141159, + "learning_rate": 1.6022132002860821e-06, + "loss": -0.0049, + "num_tokens": 6714613.0, + "reward": 13.347894668579102, + "reward_std": 1.6074857711791992, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8183344602584839, + "rewards/length2tails_reward/std": 0.23631368577480316, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2331151962280273, + "rewards/thermo_reward/std": 1.455021858215332, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.40625, + "completions/mean_terminated_length": 271.40625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09773553442209959, + "epoch": 1.546, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18035611510276794, + "learning_rate": 1.6011890139077333e-06, + "loss": -0.0003, + "num_tokens": 6723330.0, + "reward": 12.25750732421875, + "reward_std": 5.029834270477295, + "rewards/fitness_reward/mean": 6.723915100097656, + "rewards/fitness_reward/std": 2.5090818405151367, + "rewards/kidney_reward/mean": 2.3070859909057617, + "rewards/kidney_reward/std": 1.111136794090271, + "rewards/length2tails_reward/mean": 0.6995996832847595, + "rewards/length2tails_reward/std": 0.32060977816581726, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0565457344055176, + "rewards/thermo_reward/std": 1.7153652906417847, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10099483374506235, + "epoch": 1.548, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3348585367202759, + "learning_rate": 1.6001638390251073e-06, + "loss": 0.0036, + "num_tokens": 6732084.0, + "reward": 12.60731315612793, + "reward_std": 4.337095260620117, + "rewards/fitness_reward/mean": 7.034456729888916, + "rewards/fitness_reward/std": 1.8482544422149658, + "rewards/kidney_reward/mean": 2.3530704975128174, + "rewards/kidney_reward/std": 1.0146421194076538, + "rewards/length2tails_reward/mean": 0.7690781950950623, + "rewards/length2tails_reward/std": 0.27084881067276, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.042877674102783, + "rewards/thermo_reward/std": 1.8710243701934814, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.10179403517395258, + "epoch": 1.55, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1293950229883194, + "learning_rate": 1.5991376773238466e-06, + "loss": 0.0014, + "num_tokens": 6740821.0, + "reward": 12.269551277160645, + "reward_std": 5.643449306488037, + "rewards/fitness_reward/mean": 6.707425594329834, + "rewards/fitness_reward/std": 2.57702374458313, + "rewards/kidney_reward/mean": 2.304417848587036, + "rewards/kidney_reward/std": 1.4015402793884277, + "rewards/length2tails_reward/mean": 0.8244496583938599, + "rewards/length2tails_reward/std": 0.2605707049369812, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.075263023376465, + "rewards/thermo_reward/std": 1.8867286443710327, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09344108309596777, + "epoch": 1.552, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1531888246536255, + "learning_rate": 1.5981105304912159e-06, + "loss": -0.0081, + "num_tokens": 6749540.0, + "reward": 12.162508964538574, + "reward_std": 3.732130527496338, + "rewards/fitness_reward/mean": 6.8792243003845215, + "rewards/fitness_reward/std": 1.8029879331588745, + "rewards/kidney_reward/mean": 2.2372934818267822, + "rewards/kidney_reward/std": 1.1890099048614502, + "rewards/length2tails_reward/mean": 0.6980095505714417, + "rewards/length2tails_reward/std": 0.33210060000419617, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.876189947128296, + "rewards/thermo_reward/std": 1.869378685951233, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.96875, + "completions/mean_terminated_length": 272.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0962613089941442, + "epoch": 1.554, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07587003707885742, + "learning_rate": 1.5970824002161006e-06, + "loss": -0.0084, + "num_tokens": 6758307.0, + "reward": 13.13638687133789, + "reward_std": 2.2762598991394043, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4853367805480957, + "rewards/kidney_reward/std": 0.6571304202079773, + "rewards/length2tails_reward/mean": 0.7716554999351501, + "rewards/length2tails_reward/std": 0.2985219657421112, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1702094078063965, + "rewards/thermo_reward/std": 1.5183756351470947, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10898962616920471, + "epoch": 1.556, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.127036452293396, + "learning_rate": 1.5960532881890023e-06, + "loss": 0.0081, + "num_tokens": 6767054.0, + "reward": 13.312883377075195, + "reward_std": 0.7643176913261414, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7653322815895081, + "rewards/length2tails_reward/std": 0.28019002079963684, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2307629585266113, + "rewards/thermo_reward/std": 0.6471191644668579, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10188727173954248, + "epoch": 1.558, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1719304472208023, + "learning_rate": 1.595023196102037e-06, + "loss": 0.009, + "num_tokens": 6775791.0, + "reward": 13.142478942871094, + "reward_std": 2.2195565700531006, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4213266372680664, + "rewards/kidney_reward/std": 0.7287588119506836, + "rewards/length2tails_reward/mean": 0.7504162788391113, + "rewards/length2tails_reward/std": 0.2720203995704651, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1849265098571777, + "rewards/thermo_reward/std": 1.569002389907837, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.09375, + "completions/mean_terminated_length": 271.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08891648892313242, + "epoch": 1.56, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4685585498809814, + "learning_rate": 1.5939921256489327e-06, + "loss": -0.0054, + "num_tokens": 6784498.0, + "reward": 13.171124458312988, + "reward_std": 2.093531370162964, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5107901096343994, + "rewards/kidney_reward/std": 0.5183009505271912, + "rewards/length2tails_reward/mean": 0.7029236555099487, + "rewards/length2tails_reward/std": 0.30575695633888245, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.186366081237793, + "rewards/thermo_reward/std": 1.5517044067382812, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.10196580365300179, + "epoch": 1.562, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08939795941114426, + "learning_rate": 1.5929600785250256e-06, + "loss": -0.0054, + "num_tokens": 6793202.0, + "reward": 13.353775978088379, + "reward_std": 1.6284438371658325, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7209261655807495, + "rewards/length2tails_reward/std": 0.29964157938957214, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2213778495788574, + "rewards/thermo_reward/std": 1.497393250465393, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11122054141014814, + "epoch": 1.564, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10926977545022964, + "learning_rate": 1.591927056427258e-06, + "loss": -0.0009, + "num_tokens": 6801935.0, + "reward": 12.359277725219727, + "reward_std": 3.305664300918579, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3020734786987305, + "rewards/kidney_reward/std": 0.9948132038116455, + "rewards/length2tails_reward/mean": 0.7510539293289185, + "rewards/length2tails_reward/std": 0.2921181321144104, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.578423500061035, + "rewards/thermo_reward/std": 2.3195579051971436, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10621493496000767, + "epoch": 1.5659999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07537518441677094, + "learning_rate": 1.5908930610541745e-06, + "loss": -0.012, + "num_tokens": 6810675.0, + "reward": 11.830780029296875, + "reward_std": 4.0461931228637695, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.144914388656616, + "rewards/kidney_reward/std": 1.109388828277588, + "rewards/length2tails_reward/mean": 0.7219532132148743, + "rewards/length2tails_reward/std": 0.37782686948776245, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.4606173038482666, + "rewards/thermo_reward/std": 2.188671112060547, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 273.25, + "completions/mean_terminated_length": 273.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11602041311562061, + "epoch": 1.568, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24273134768009186, + "learning_rate": 1.5898580941059217e-06, + "loss": 0.0094, + "num_tokens": 6819451.0, + "reward": 13.579347610473633, + "reward_std": 1.249585747718811, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7413228750228882, + "rewards/length2tails_reward/std": 0.3023105263710022, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4449102878570557, + "rewards/thermo_reward/std": 1.1124043464660645, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.9375, + "completions/mean_terminated_length": 273.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10096332058310509, + "epoch": 1.5699999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08716148883104324, + "learning_rate": 1.588822157284242e-06, + "loss": -0.005, + "num_tokens": 6828249.0, + "reward": 13.189170837402344, + "reward_std": 1.5594357252120972, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8377348184585571, + "rewards/length2tails_reward/std": 0.26007550954818726, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.072450637817383, + "rewards/thermo_reward/std": 1.5148123502731323, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10304850526154041, + "epoch": 1.572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16281567513942719, + "learning_rate": 1.587785252292473e-06, + "loss": 0.0046, + "num_tokens": 6836993.0, + "reward": 13.08975601196289, + "reward_std": 2.4612064361572266, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4109854698181152, + "rewards/kidney_reward/std": 0.848101019859314, + "rewards/length2tails_reward/mean": 0.7908504605293274, + "rewards/length2tails_reward/std": 0.23914951086044312, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1960091590881348, + "rewards/thermo_reward/std": 1.4220669269561768, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10091214813292027, + "epoch": 1.5739999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1641678363084793, + "learning_rate": 1.5867473808355452e-06, + "loss": 0.001, + "num_tokens": 6845737.0, + "reward": 13.60064697265625, + "reward_std": 1.1486788988113403, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7576706409454346, + "rewards/length2tails_reward/std": 0.2726164758205414, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4645745754241943, + "rewards/thermo_reward/std": 1.0126454830169678, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10151567868888378, + "epoch": 1.576, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16830457746982574, + "learning_rate": 1.5857085446199769e-06, + "loss": 0.003, + "num_tokens": 6854488.0, + "reward": 13.096981048583984, + "reward_std": 1.9139457941055298, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5303173065185547, + "rewards/kidney_reward/std": 0.5439756512641907, + "rewards/length2tails_reward/mean": 0.7250336408615112, + "rewards/length2tails_reward/std": 0.33325880765914917, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0904839038848877, + "rewards/thermo_reward/std": 1.2277387380599976, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 274.21875, + "completions/mean_terminated_length": 274.21875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.10681565944105387, + "epoch": 1.5779999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0950051099061966, + "learning_rate": 1.5846687453538735e-06, + "loss": -0.0039, + "num_tokens": 6863295.0, + "reward": 12.962221145629883, + "reward_std": 2.356898784637451, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4387731552124023, + "rewards/kidney_reward/std": 0.6374660134315491, + "rewards/length2tails_reward/mean": 0.8561804294586182, + "rewards/length2tails_reward/std": 0.1828988492488861, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9766440391540527, + "rewards/thermo_reward/std": 1.8269686698913574, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09641438629478216, + "epoch": 1.58, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2553471326828003, + "learning_rate": 1.5836279847469234e-06, + "loss": 0.0014, + "num_tokens": 6872018.0, + "reward": 12.10179615020752, + "reward_std": 5.340146541595459, + "rewards/fitness_reward/mean": 6.624960422515869, + "rewards/fitness_reward/std": 2.897318124771118, + "rewards/kidney_reward/mean": 2.352849006652832, + "rewards/kidney_reward/std": 1.0995126962661743, + "rewards/length2tails_reward/mean": 0.7194082736968994, + "rewards/length2tails_reward/std": 0.30610015988349915, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9520459175109863, + "rewards/thermo_reward/std": 1.7063323259353638, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09921843744814396, + "epoch": 1.5819999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09495562314987183, + "learning_rate": 1.582586264510396e-06, + "loss": -0.0037, + "num_tokens": 6880782.0, + "reward": 12.491376876831055, + "reward_std": 3.032684326171875, + "rewards/fitness_reward/mean": 7.188657283782959, + "rewards/fitness_reward/std": 0.7179933190345764, + "rewards/kidney_reward/mean": 2.3157799243927, + "rewards/kidney_reward/std": 0.9158964157104492, + "rewards/length2tails_reward/mean": 0.7796016335487366, + "rewards/length2tails_reward/std": 0.290948748588562, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8089795112609863, + "rewards/thermo_reward/std": 1.6366848945617676, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10440836194902658, + "epoch": 1.584, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11112482845783234, + "learning_rate": 1.5815435863571387e-06, + "loss": -0.0043, + "num_tokens": 6889520.0, + "reward": 12.341741561889648, + "reward_std": 3.5781798362731934, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.374967098236084, + "rewards/kidney_reward/std": 0.794853687286377, + "rewards/length2tails_reward/mean": 0.7389723062515259, + "rewards/length2tails_reward/std": 0.2798449695110321, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.739823341369629, + "rewards/thermo_reward/std": 1.8928732872009277, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1030658707022667, + "epoch": 1.5859999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10627459734678268, + "learning_rate": 1.5804999520015733e-06, + "loss": -0.0052, + "num_tokens": 6898262.0, + "reward": 13.374687194824219, + "reward_std": 2.236538887023926, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4553585052490234, + "rewards/kidney_reward/std": 0.8229176998138428, + "rewards/length2tails_reward/mean": 0.72886723279953, + "rewards/length2tails_reward/std": 0.32557907700538635, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.442765951156616, + "rewards/thermo_reward/std": 1.1234129667282104, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10048514977097511, + "epoch": 1.588, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05569272115826607, + "learning_rate": 1.579455363159695e-06, + "loss": -0.0057, + "num_tokens": 6906977.0, + "reward": 13.202973365783691, + "reward_std": 1.7612926959991455, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7030308842658997, + "rewards/length2tails_reward/std": 0.3247583210468292, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.099724292755127, + "rewards/thermo_reward/std": 1.5753053426742554, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10165113396942616, + "epoch": 1.5899999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14155659079551697, + "learning_rate": 1.5784098215490666e-06, + "loss": -0.0037, + "num_tokens": 6915733.0, + "reward": 13.36789321899414, + "reward_std": 1.1217387914657593, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7168630361557007, + "rewards/length2tails_reward/std": 0.3124147355556488, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2632603645324707, + "rewards/thermo_reward/std": 1.056195616722107, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10210881475359201, + "epoch": 1.592, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08840259909629822, + "learning_rate": 1.5773633288888195e-06, + "loss": 0.001, + "num_tokens": 6924478.0, + "reward": 13.514481544494629, + "reward_std": 1.4057046175003052, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.536932945251465, + "rewards/kidney_reward/std": 0.5065523982048035, + "rewards/length2tails_reward/mean": 0.7745459079742432, + "rewards/length2tails_reward/std": 0.2727997303009033, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4389090538024902, + "rewards/thermo_reward/std": 0.9537463784217834, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09803248103708029, + "epoch": 1.5939999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09315460920333862, + "learning_rate": 1.5763158868996475e-06, + "loss": -0.0068, + "num_tokens": 6933222.0, + "reward": 12.848220825195312, + "reward_std": 2.481563091278076, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.448021173477173, + "rewards/kidney_reward/std": 0.590040922164917, + "rewards/length2tails_reward/mean": 0.7305286526679993, + "rewards/length2tails_reward/std": 0.3285638391971588, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.923471450805664, + "rewards/thermo_reward/std": 1.7929234504699707, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09597568307071924, + "epoch": 1.596, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06843114644289017, + "learning_rate": 1.5752674973038059e-06, + "loss": 0.0002, + "num_tokens": 6941953.0, + "reward": 13.095829010009766, + "reward_std": 1.8167020082473755, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4896838665008545, + "rewards/kidney_reward/std": 0.3229711949825287, + "rewards/length2tails_reward/mean": 0.7349690198898315, + "rewards/length2tails_reward/std": 0.28378915786743164, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.071463108062744, + "rewards/thermo_reward/std": 1.5440276861190796, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1026212815195322, + "epoch": 1.5979999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1733075976371765, + "learning_rate": 1.574218161825108e-06, + "loss": -0.0014, + "num_tokens": 6950654.0, + "reward": 11.799349784851074, + "reward_std": 5.825907230377197, + "rewards/fitness_reward/mean": 6.423315048217773, + "rewards/fitness_reward/std": 2.9632887840270996, + "rewards/kidney_reward/mean": 2.2777504920959473, + "rewards/kidney_reward/std": 1.1740972995758057, + "rewards/length2tails_reward/mean": 0.6755983829498291, + "rewards/length2tails_reward/std": 0.2964775860309601, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 2.93697452545166, + "rewards/thermo_reward/std": 1.9402716159820557, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10204800125211477, + "epoch": 1.6, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14254425466060638, + "learning_rate": 1.5731678821889222e-06, + "loss": 0.0027, + "num_tokens": 6959380.0, + "reward": 13.141883850097656, + "reward_std": 1.9513304233551025, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.411172389984131, + "rewards/kidney_reward/std": 0.649071216583252, + "rewards/length2tails_reward/mean": 0.6890884637832642, + "rewards/length2tails_reward/std": 0.3426918089389801, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.258126735687256, + "rewards/thermo_reward/std": 1.080284595489502, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09764991933479905, + "epoch": 1.6019999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1123654767870903, + "learning_rate": 1.5721166601221695e-06, + "loss": 0.0052, + "num_tokens": 6968108.0, + "reward": 13.41652774810791, + "reward_std": 2.3344967365264893, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5197842121124268, + "rewards/kidney_reward/std": 0.6035600304603577, + "rewards/length2tails_reward/mean": 0.7101686000823975, + "rewards/length2tails_reward/std": 0.32164332270622253, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4220504760742188, + "rewards/thermo_reward/std": 1.4359506368637085, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09724361402913928, + "epoch": 1.604, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05826536566019058, + "learning_rate": 1.5710644973533207e-06, + "loss": -0.002, + "num_tokens": 6976844.0, + "reward": 13.142796516418457, + "reward_std": 2.4546236991882324, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4166100025177, + "rewards/kidney_reward/std": 0.8939437866210938, + "rewards/length2tails_reward/mean": 0.7290231585502625, + "rewards/length2tails_reward/std": 0.33181530237197876, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.192098617553711, + "rewards/thermo_reward/std": 1.6234620809555054, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09845475014299154, + "epoch": 1.6059999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11397599428892136, + "learning_rate": 1.570011395612393e-06, + "loss": -0.0044, + "num_tokens": 6985565.0, + "reward": 13.158821105957031, + "reward_std": 2.5375492572784424, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.423973560333252, + "rewards/kidney_reward/std": 0.7991431355476379, + "rewards/length2tails_reward/mean": 0.7127482891082764, + "rewards/length2tails_reward/std": 0.31570836901664734, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2598977088928223, + "rewards/thermo_reward/std": 1.5675069093704224, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09938731510192156, + "epoch": 1.608, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13295812904834747, + "learning_rate": 1.568957356630947e-06, + "loss": -0.0022, + "num_tokens": 6994319.0, + "reward": 13.225662231445312, + "reward_std": 2.6128029823303223, + "rewards/fitness_reward/mean": 6.987685203552246, + "rewards/fitness_reward/std": 2.1128344535827637, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7760343551635742, + "rewards/length2tails_reward/std": 0.284006267786026, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.488612651824951, + "rewards/thermo_reward/std": 0.8943835496902466, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.11059173010289669, + "epoch": 1.6099999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10347039252519608, + "learning_rate": 1.567902382142086e-06, + "loss": -0.0026, + "num_tokens": 7003045.0, + "reward": 12.732093811035156, + "reward_std": 2.605804204940796, + "rewards/fitness_reward/mean": 7.188657283782959, + "rewards/fitness_reward/std": 0.7179933190345764, + "rewards/kidney_reward/mean": 2.3838021755218506, + "rewards/kidney_reward/std": 0.7902752757072449, + "rewards/length2tails_reward/mean": 0.7132542729377747, + "rewards/length2tails_reward/std": 0.2912064790725708, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.988309621810913, + "rewards/thermo_reward/std": 1.5316686630249023, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.09359847661107779, + "epoch": 1.612, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0684969425201416, + "learning_rate": 1.56684647388045e-06, + "loss": -0.0072, + "num_tokens": 7011774.0, + "reward": 13.25399398803711, + "reward_std": 1.4808253049850464, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4519968032836914, + "rewards/kidney_reward/std": 0.8416019678115845, + "rewards/length2tails_reward/mean": 0.7692837715148926, + "rewards/length2tails_reward/std": 0.29183003306388855, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2638840675354004, + "rewards/thermo_reward/std": 1.1789251565933228, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09691243059933186, + "epoch": 1.6139999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06483133137226105, + "learning_rate": 1.5657896335822145e-06, + "loss": -0.0044, + "num_tokens": 7020518.0, + "reward": 13.43472671508789, + "reward_std": 1.8454060554504395, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5281412601470947, + "rewards/kidney_reward/std": 0.556286096572876, + "rewards/length2tails_reward/mean": 0.7682965993881226, + "rewards/length2tails_reward/std": 0.25869956612586975, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3685708045959473, + "rewards/thermo_reward/std": 1.308498501777649, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09843306802213192, + "epoch": 1.616, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09614210575819016, + "learning_rate": 1.5647318629850883e-06, + "loss": -0.0024, + "num_tokens": 7029262.0, + "reward": 12.651477813720703, + "reward_std": 3.146268606185913, + "rewards/fitness_reward/mean": 6.980320930480957, + "rewards/fitness_reward/std": 2.154494524002075, + "rewards/kidney_reward/mean": 2.462048053741455, + "rewards/kidney_reward/std": 0.6474180817604065, + "rewards/length2tails_reward/mean": 0.6856108903884888, + "rewards/length2tails_reward/std": 0.3616497218608856, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0405473709106445, + "rewards/thermo_reward/std": 1.6127054691314697, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09584531467407942, + "epoch": 1.6179999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08554235845804214, + "learning_rate": 1.563673163828309e-06, + "loss": -0.0034, + "num_tokens": 7038026.0, + "reward": 13.498817443847656, + "reward_std": 1.6057590246200562, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.491739273071289, + "rewards/kidney_reward/std": 0.621989369392395, + "rewards/length2tails_reward/mean": 0.8107680082321167, + "rewards/length2tails_reward/std": 0.22752250730991364, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.464815378189087, + "rewards/thermo_reward/std": 1.0114378929138184, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09629400447010994, + "epoch": 1.62, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1261540800333023, + "learning_rate": 1.5626135378526417e-06, + "loss": 0.0003, + "num_tokens": 7046766.0, + "reward": 12.858604431152344, + "reward_std": 4.045932769775391, + "rewards/fitness_reward/mean": 7.010948657989502, + "rewards/fitness_reward/std": 1.9812366962432861, + "rewards/kidney_reward/mean": 2.477144718170166, + "rewards/kidney_reward/std": 0.8447661399841309, + "rewards/length2tails_reward/mean": 0.7198797464370728, + "rewards/length2tails_reward/std": 0.3227717876434326, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.19852352142334, + "rewards/thermo_reward/std": 1.4419629573822021, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.11237006168812513, + "epoch": 1.6219999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2420001029968262, + "learning_rate": 1.5615529868003747e-06, + "loss": -0.0043, + "num_tokens": 7055522.0, + "reward": 12.745379447937012, + "reward_std": 2.7641563415527344, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.388613224029541, + "rewards/kidney_reward/std": 0.904141366481781, + "rewards/length2tails_reward/mean": 0.7696863412857056, + "rewards/length2tails_reward/std": 0.28826603293418884, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.876121997833252, + "rewards/thermo_reward/std": 1.8102315664291382, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 276.03125, + "completions/mean_terminated_length": 276.03125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.10469907149672508, + "epoch": 1.624, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1210826188325882, + "learning_rate": 1.5604915124153179e-06, + "loss": -0.0015, + "num_tokens": 7064387.0, + "reward": 13.120941162109375, + "reward_std": 1.6486657857894897, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.517043113708496, + "rewards/kidney_reward/std": 0.2941751182079315, + "rewards/length2tails_reward/mean": 0.883176326751709, + "rewards/length2tails_reward/std": 0.18250958621501923, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0543954372406006, + "rewards/thermo_reward/std": 1.4460625648498535, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.5, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09974722750484943, + "epoch": 1.626, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10284367948770523, + "learning_rate": 1.5594291164427996e-06, + "loss": -0.0039, + "num_tokens": 7073171.0, + "reward": 12.768733978271484, + "reward_std": 3.78942608833313, + "rewards/fitness_reward/mean": 7.052046775817871, + "rewards/fitness_reward/std": 1.7487484216690063, + "rewards/kidney_reward/mean": 2.329763889312744, + "rewards/kidney_reward/std": 1.1252422332763672, + "rewards/length2tails_reward/mean": 0.8165687918663025, + "rewards/length2tails_reward/std": 0.2892093062400818, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.205265522003174, + "rewards/thermo_reward/std": 1.6000819206237793, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10471698269248009, + "epoch": 1.6280000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13619332015514374, + "learning_rate": 1.5583658006296623e-06, + "loss": -0.0018, + "num_tokens": 7081941.0, + "reward": 12.858470916748047, + "reward_std": 2.571259021759033, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4575161933898926, + "rewards/kidney_reward/std": 0.8109323978424072, + "rewards/length2tails_reward/mean": 0.7841614484786987, + "rewards/length2tails_reward/std": 0.300397127866745, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.861354112625122, + "rewards/thermo_reward/std": 1.9132797718048096, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.96875, + "completions/mean_terminated_length": 272.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1012994833290577, + "epoch": 1.63, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11487077176570892, + "learning_rate": 1.5573015667242624e-06, + "loss": 0.0013, + "num_tokens": 7090708.0, + "reward": 13.450803756713867, + "reward_std": 1.1194764375686646, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7772794365882874, + "rewards/length2tails_reward/std": 0.2860060930252075, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3401296138763428, + "rewards/thermo_reward/std": 1.0613343715667725, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09142725821584463, + "epoch": 1.6320000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11149288713932037, + "learning_rate": 1.5562364164764647e-06, + "loss": 0.0033, + "num_tokens": 7099405.0, + "reward": 12.100950241088867, + "reward_std": 5.697356224060059, + "rewards/fitness_reward/mean": 6.847840309143066, + "rewards/fitness_reward/std": 2.3250811100006104, + "rewards/kidney_reward/mean": 2.149076461791992, + "rewards/kidney_reward/std": 1.5661954879760742, + "rewards/length2tails_reward/mean": 0.6195429563522339, + "rewards/length2tails_reward/std": 0.38124367594718933, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9420790672302246, + "rewards/thermo_reward/std": 2.0163540840148926, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0980948880314827, + "epoch": 1.634, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08849485963582993, + "learning_rate": 1.5551703516376416e-06, + "loss": -0.0062, + "num_tokens": 7108124.0, + "reward": 12.822610855102539, + "reward_std": 2.8037033081054688, + "rewards/fitness_reward/mean": 6.9903459548950195, + "rewards/fitness_reward/std": 1.7917176485061646, + "rewards/kidney_reward/mean": 2.4557790756225586, + "rewards/kidney_reward/std": 0.5509402751922607, + "rewards/length2tails_reward/mean": 0.7287914752960205, + "rewards/length2tails_reward/std": 0.28611382842063904, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.203606128692627, + "rewards/thermo_reward/std": 1.2543768882751465, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09814401809126139, + "epoch": 1.6360000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.626882791519165, + "learning_rate": 1.554103373960668e-06, + "loss": -0.0043, + "num_tokens": 7116851.0, + "reward": 11.834861755371094, + "reward_std": 4.401232719421387, + "rewards/fitness_reward/mean": 6.920083045959473, + "rewards/fitness_reward/std": 1.9360997676849365, + "rewards/kidney_reward/mean": 2.083052635192871, + "rewards/kidney_reward/std": 1.3474140167236328, + "rewards/length2tails_reward/mean": 0.7249131202697754, + "rewards/length2tails_reward/std": 0.3437511622905731, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.659235954284668, + "rewards/thermo_reward/std": 2.278928756713867, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.5625, + "completions/mean_terminated_length": 273.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09511575661599636, + "epoch": 1.638, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06473495066165924, + "learning_rate": 1.5530354851999214e-06, + "loss": -0.0049, + "num_tokens": 7125637.0, + "reward": 12.665140151977539, + "reward_std": 3.4468960762023926, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.422769546508789, + "rewards/kidney_reward/std": 0.7039707899093628, + "rewards/length2tails_reward/mean": 0.8116496801376343, + "rewards/length2tails_reward/std": 0.2855256497859955, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0081522464752197, + "rewards/thermo_reward/std": 1.7059727907180786, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.59375, + "completions/mean_terminated_length": 270.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08755200356245041, + "epoch": 1.6400000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07360640913248062, + "learning_rate": 1.5519666871112763e-06, + "loss": -0.0051, + "num_tokens": 7134328.0, + "reward": 13.005146026611328, + "reward_std": 2.5275611877441406, + "rewards/fitness_reward/mean": 7.009998321533203, + "rewards/fitness_reward/std": 1.9866119623184204, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.6213452816009521, + "rewards/length2tails_reward/std": 0.3339705765247345, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.261251449584961, + "rewards/thermo_reward/std": 1.0655933618545532, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10028571262955666, + "epoch": 1.642, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07514243572950363, + "learning_rate": 1.5508969814521024e-06, + "loss": -0.0069, + "num_tokens": 7143071.0, + "reward": 13.104477882385254, + "reward_std": 3.2510416507720947, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.511730670928955, + "rewards/kidney_reward/std": 0.5132253766059875, + "rewards/length2tails_reward/mean": 0.7408324480056763, + "rewards/length2tails_reward/std": 0.2940310835838318, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.365610361099243, + "rewards/thermo_reward/std": 1.4092084169387817, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.40625, + "completions/mean_terminated_length": 270.40625, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.09334253240376711, + "epoch": 1.6440000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3840697705745697, + "learning_rate": 1.5498263699812623e-06, + "loss": -0.0057, + "num_tokens": 7151756.0, + "reward": 12.753169059753418, + "reward_std": 3.204993963241577, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.2599570751190186, + "rewards/kidney_reward/std": 1.1070548295974731, + "rewards/length2tails_reward/mean": 0.709437906742096, + "rewards/length2tails_reward/std": 0.3272755742073059, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.01859188079834, + "rewards/thermo_reward/std": 1.9252240657806396, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10825805831700563, + "epoch": 1.646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15281492471694946, + "learning_rate": 1.5487548544591073e-06, + "loss": -0.0031, + "num_tokens": 7160522.0, + "reward": 13.315460205078125, + "reward_std": 1.6204856634140015, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8107447028160095, + "rewards/length2tails_reward/std": 0.2631240785121918, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1740806102752686, + "rewards/thermo_reward/std": 1.5049045085906982, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10472884587943554, + "epoch": 1.6480000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07853143662214279, + "learning_rate": 1.5476824366474754e-06, + "loss": 0.0021, + "num_tokens": 7169238.0, + "reward": 13.464456558227539, + "reward_std": 1.1619950532913208, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7010312676429749, + "rewards/length2tails_reward/std": 0.2853972315788269, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3066892623901367, + "rewards/thermo_reward/std": 1.1649514436721802, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09919088799506426, + "epoch": 1.65, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14531446993350983, + "learning_rate": 1.5466091183096884e-06, + "loss": -0.0041, + "num_tokens": 7177975.0, + "reward": 12.735475540161133, + "reward_std": 2.1816654205322266, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.422218084335327, + "rewards/kidney_reward/std": 0.594012975692749, + "rewards/length2tails_reward/mean": 0.7504751682281494, + "rewards/length2tails_reward/std": 0.272549033164978, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.777024984359741, + "rewards/thermo_reward/std": 1.803603172302246, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09980232920497656, + "epoch": 1.6520000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9415404200553894, + "learning_rate": 1.5455349012105486e-06, + "loss": -0.0053, + "num_tokens": 7186739.0, + "reward": 12.596288681030273, + "reward_std": 2.6109566688537598, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3617115020751953, + "rewards/kidney_reward/std": 0.9074901938438416, + "rewards/length2tails_reward/mean": 0.7563230395317078, + "rewards/length2tails_reward/std": 0.3114321231842041, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7552688121795654, + "rewards/thermo_reward/std": 1.6710541248321533, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.71875, + "completions/mean_terminated_length": 272.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10263726208359003, + "epoch": 1.654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07845157384872437, + "learning_rate": 1.5444597871163359e-06, + "loss": 0.0014, + "num_tokens": 7195498.0, + "reward": 13.242258071899414, + "reward_std": 1.4796823263168335, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7627227902412415, + "rewards/length2tails_reward/std": 0.3197879493236542, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.275416851043701, + "rewards/thermo_reward/std": 1.0002073049545288, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10090090520679951, + "epoch": 1.6560000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15500804781913757, + "learning_rate": 1.5433837777948058e-06, + "loss": -0.0038, + "num_tokens": 7204236.0, + "reward": 12.889111518859863, + "reward_std": 2.009411573410034, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4222664833068848, + "rewards/kidney_reward/std": 0.5937747359275818, + "rewards/length2tails_reward/mean": 0.6735501289367676, + "rewards/length2tails_reward/std": 0.3572605550289154, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9383044242858887, + "rewards/thermo_reward/std": 1.5431135892868042, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09415333159267902, + "epoch": 1.658, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19570784270763397, + "learning_rate": 1.5423068750151846e-06, + "loss": -0.0063, + "num_tokens": 7212967.0, + "reward": 13.097809791564941, + "reward_std": 2.0937092304229736, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5107388496398926, + "rewards/kidney_reward/std": 0.5185781121253967, + "rewards/length2tails_reward/mean": 0.6930172443389893, + "rewards/length2tails_reward/std": 0.34632742404937744, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.056584119796753, + "rewards/thermo_reward/std": 1.6953647136688232, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10683467425405979, + "epoch": 1.6600000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21493102610111237, + "learning_rate": 1.5412290805481684e-06, + "loss": -0.0015, + "num_tokens": 7221679.0, + "reward": 12.32304859161377, + "reward_std": 5.012633323669434, + "rewards/fitness_reward/mean": 7.001071453094482, + "rewards/fitness_reward/std": 2.0371105670928955, + "rewards/kidney_reward/mean": 2.3606696128845215, + "rewards/kidney_reward/std": 1.2035049200057983, + "rewards/length2tails_reward/mean": 0.7072513103485107, + "rewards/length2tails_reward/std": 0.3007599115371704, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7905826568603516, + "rewards/thermo_reward/std": 2.080504894256592, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09869087766855955, + "epoch": 1.662, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13675719499588013, + "learning_rate": 1.5401503961659201e-06, + "loss": 0.0012, + "num_tokens": 7230422.0, + "reward": 13.259733200073242, + "reward_std": 1.3868850469589233, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.79951411485672, + "rewards/length2tails_reward/std": 0.20172974467277527, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1468350887298584, + "rewards/thermo_reward/std": 1.2325853109359741, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.09462389722466469, + "epoch": 1.6640000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2891161441802979, + "learning_rate": 1.5390708236420645e-06, + "loss": -0.0078, + "num_tokens": 7239109.0, + "reward": 12.70787525177002, + "reward_std": 2.7165496349334717, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3758201599121094, + "rewards/kidney_reward/std": 0.7904667854309082, + "rewards/length2tails_reward/mean": 0.6734529733657837, + "rewards/length2tails_reward/std": 0.3371480703353882, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8610341548919678, + "rewards/thermo_reward/std": 1.8633620738983154, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09248152980580926, + "epoch": 1.666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08377497643232346, + "learning_rate": 1.5379903647516877e-06, + "loss": -0.0066, + "num_tokens": 7247853.0, + "reward": 13.391145706176758, + "reward_std": 1.536651372909546, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7327719330787659, + "rewards/length2tails_reward/std": 0.3565005660057068, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2575631141662598, + "rewards/thermo_reward/std": 1.441807508468628, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.0968128452077508, + "epoch": 1.6680000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15227289497852325, + "learning_rate": 1.5369090212713322e-06, + "loss": -0.0027, + "num_tokens": 7256617.0, + "reward": 13.647794723510742, + "reward_std": 0.6356459856033325, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7752425670623779, + "rewards/length2tails_reward/std": 0.30526676774024963, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.50996470451355, + "rewards/thermo_reward/std": 0.5615194439888, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1001192070543766, + "epoch": 1.67, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1381293684244156, + "learning_rate": 1.5358267949789964e-06, + "loss": -0.0053, + "num_tokens": 7265367.0, + "reward": 12.299043655395508, + "reward_std": 4.733181953430176, + "rewards/fitness_reward/mean": 6.9439544677734375, + "rewards/fitness_reward/std": 2.050342321395874, + "rewards/kidney_reward/mean": 2.3449745178222656, + "rewards/kidney_reward/std": 1.142852544784546, + "rewards/length2tails_reward/mean": 0.7524625658988953, + "rewards/length2tails_reward/std": 0.2887047231197357, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.834869146347046, + "rewards/thermo_reward/std": 1.8959267139434814, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09611952304840088, + "epoch": 1.6720000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09297409653663635, + "learning_rate": 1.5347436876541295e-06, + "loss": -0.0029, + "num_tokens": 7274070.0, + "reward": 13.16862964630127, + "reward_std": 1.9593685865402222, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4035873413085938, + "rewards/kidney_reward/std": 0.7815843224525452, + "rewards/length2tails_reward/mean": 0.7112776637077332, + "rewards/length2tails_reward/std": 0.25806063413619995, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.232728958129883, + "rewards/thermo_reward/std": 1.303803563117981, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09741265419870615, + "epoch": 1.674, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09869790822267532, + "learning_rate": 1.533659701077629e-06, + "loss": -0.0073, + "num_tokens": 7282818.0, + "reward": 13.390752792358398, + "reward_std": 1.6376488208770752, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7550690174102783, + "rewards/length2tails_reward/std": 0.30704745650291443, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2549397945404053, + "rewards/thermo_reward/std": 1.5037169456481934, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09892761148512363, + "epoch": 1.6760000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05691608041524887, + "learning_rate": 1.5325748370318383e-06, + "loss": -0.0045, + "num_tokens": 7291572.0, + "reward": 12.643072128295898, + "reward_std": 3.3166003227233887, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.4200446605682373, + "rewards/kidney_reward/std": 0.7146202325820923, + "rewards/length2tails_reward/mean": 0.7664644718170166, + "rewards/length2tails_reward/std": 0.29053762555122375, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9933269023895264, + "rewards/thermo_reward/std": 1.3843075037002563, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.28125, + "completions/mean_terminated_length": 273.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10477215051651001, + "epoch": 1.678, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15455517172813416, + "learning_rate": 1.5314890973005445e-06, + "loss": 0.0024, + "num_tokens": 7300349.0, + "reward": 13.205035209655762, + "reward_std": 1.625687599182129, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.8004459142684937, + "rewards/length2tails_reward/std": 0.2789980173110962, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.119403600692749, + "rewards/thermo_reward/std": 1.4113643169403076, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 281.4375, + "completions/mean_terminated_length": 281.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1059467475861311, + "epoch": 1.6800000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.29008674621582, + "learning_rate": 1.530402483668973e-06, + "loss": 0.1573, + "num_tokens": 7309387.0, + "reward": 12.298429489135742, + "reward_std": 5.2238335609436035, + "rewards/fitness_reward/mean": 6.982968330383301, + "rewards/fitness_reward/std": 2.1395151615142822, + "rewards/kidney_reward/mean": 2.219698905944824, + "rewards/kidney_reward/std": 1.50632905960083, + "rewards/length2tails_reward/mean": 0.7398964166641235, + "rewards/length2tails_reward/std": 0.3255484402179718, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9217729568481445, + "rewards/thermo_reward/std": 2.0766680240631104, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09004503022879362, + "epoch": 1.682, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07574941962957382, + "learning_rate": 1.5293149979237875e-06, + "loss": -0.0046, + "num_tokens": 7318095.0, + "reward": 13.034381866455078, + "reward_std": 2.6836977005004883, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.4217729568481445, + "rewards/kidney_reward/std": 1.0101513862609863, + "rewards/length2tails_reward/mean": 0.6856233477592468, + "rewards/length2tails_reward/std": 0.33994343876838684, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.197880744934082, + "rewards/thermo_reward/std": 1.4554771184921265, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.96875, + "completions/mean_terminated_length": 272.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0994929876178503, + "epoch": 1.6840000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24888205528259277, + "learning_rate": 1.5282266418530844e-06, + "loss": 0.0007, + "num_tokens": 7326862.0, + "reward": 12.968977928161621, + "reward_std": 4.216531276702881, + "rewards/fitness_reward/mean": 7.013382911682129, + "rewards/fitness_reward/std": 1.9674652814865112, + "rewards/kidney_reward/mean": 2.4493489265441895, + "rewards/kidney_reward/std": 1.0020034313201904, + "rewards/length2tails_reward/mean": 0.8029117584228516, + "rewards/length2tails_reward/std": 0.2570558190345764, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3259544372558594, + "rewards/thermo_reward/std": 1.3275809288024902, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 273.90625, + "completions/mean_terminated_length": 273.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13375951629132032, + "epoch": 1.686, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29555225372314453, + "learning_rate": 1.5271374172463922e-06, + "loss": -0.0007, + "num_tokens": 7335659.0, + "reward": 13.37346076965332, + "reward_std": 1.3563597202301025, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7223860025405884, + "rewards/length2tails_reward/std": 0.30389323830604553, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2409167289733887, + "rewards/thermo_reward/std": 1.2600083351135254, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09282128792256117, + "epoch": 1.688, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11549926549196243, + "learning_rate": 1.526047325894667e-06, + "loss": -0.0006, + "num_tokens": 7344431.0, + "reward": 13.49197006225586, + "reward_std": 1.3360706567764282, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8050989508628845, + "rewards/length2tails_reward/std": 0.2730928063392639, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3511548042297363, + "rewards/thermo_reward/std": 1.1984219551086426, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.40625, + "completions/mean_terminated_length": 273.40625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.11189008224755526, + "epoch": 1.69, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11807180196046829, + "learning_rate": 1.5249563695902903e-06, + "loss": -0.0042, + "num_tokens": 7353212.0, + "reward": 13.080742835998535, + "reward_std": 2.1417932510375977, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5310442447662354, + "rewards/kidney_reward/std": 0.539863646030426, + "rewards/length2tails_reward/mean": 0.8166994452476501, + "rewards/length2tails_reward/std": 0.2542852759361267, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.006843090057373, + "rewards/thermo_reward/std": 1.7348278760910034, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10713509004563093, + "epoch": 1.692, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10242803394794464, + "learning_rate": 1.5238645501270652e-06, + "loss": -0.001, + "num_tokens": 7361951.0, + "reward": 12.868669509887695, + "reward_std": 1.8826740980148315, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5312321186065674, + "rewards/kidney_reward/std": 0.5388016104698181, + "rewards/length2tails_reward/mean": 0.7111541032791138, + "rewards/length2tails_reward/std": 0.3376815617084503, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8051371574401855, + "rewards/thermo_reward/std": 1.533287525177002, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09818495530635118, + "epoch": 1.694, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1153191477060318, + "learning_rate": 1.5227718693002153e-06, + "loss": -0.0083, + "num_tokens": 7370671.0, + "reward": 12.53122615814209, + "reward_std": 3.189180850982666, + "rewards/fitness_reward/mean": 7.188657760620117, + "rewards/fitness_reward/std": 0.7179933190345764, + "rewards/kidney_reward/mean": 2.359818458557129, + "rewards/kidney_reward/std": 0.848675012588501, + "rewards/length2tails_reward/mean": 0.7231026291847229, + "rewards/length2tails_reward/std": 0.3215695023536682, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8104398250579834, + "rewards/thermo_reward/std": 1.989101529121399, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.71875, + "completions/mean_terminated_length": 272.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09497405029833317, + "epoch": 1.696, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10651351511478424, + "learning_rate": 1.5216783289063785e-06, + "loss": -0.0064, + "num_tokens": 7379430.0, + "reward": 12.566158294677734, + "reward_std": 3.863691806793213, + "rewards/fitness_reward/mean": 6.99554443359375, + "rewards/fitness_reward/std": 1.7628074884414673, + "rewards/kidney_reward/mean": 2.3516533374786377, + "rewards/kidney_reward/std": 1.0224820375442505, + "rewards/length2tails_reward/mean": 0.7389246225357056, + "rewards/length2tails_reward/std": 0.33149829506874084, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0450682640075684, + "rewards/thermo_reward/std": 1.6298531293869019, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09245220106095076, + "epoch": 1.698, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06719689071178436, + "learning_rate": 1.5205839307436086e-06, + "loss": -0.0062, + "num_tokens": 7388162.0, + "reward": 12.616239547729492, + "reward_std": 4.165755748748779, + "rewards/fitness_reward/mean": 6.682142734527588, + "rewards/fitness_reward/std": 2.684154748916626, + "rewards/kidney_reward/mean": 2.4843716621398926, + "rewards/kidney_reward/std": 0.5299732089042664, + "rewards/length2tails_reward/mean": 0.6952399015426636, + "rewards/length2tails_reward/std": 0.328195184469223, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2802019119262695, + "rewards/thermo_reward/std": 1.1215753555297852, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10125975962728262, + "epoch": 1.7, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6502317190170288, + "learning_rate": 1.5194886766113672e-06, + "loss": -0.0056, + "num_tokens": 7396904.0, + "reward": 12.429765701293945, + "reward_std": 3.9128546714782715, + "rewards/fitness_reward/mean": 6.938035011291504, + "rewards/fitness_reward/std": 1.78042471408844, + "rewards/kidney_reward/mean": 2.3061063289642334, + "rewards/kidney_reward/std": 0.9397987723350525, + "rewards/length2tails_reward/mean": 0.7385504245758057, + "rewards/length2tails_reward/std": 0.3328675329685211, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0117697715759277, + "rewards/thermo_reward/std": 1.61867094039917, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.0, + "completions/mean_terminated_length": 273.0, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09814795758575201, + "epoch": 1.702, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09595108777284622, + "learning_rate": 1.5183925683105251e-06, + "loss": -0.0025, + "num_tokens": 7405672.0, + "reward": 13.521875381469727, + "reward_std": 1.1724050045013428, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7721210718154907, + "rewards/length2tails_reward/std": 0.30832090973854065, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3569979667663574, + "rewards/thermo_reward/std": 1.1689194440841675, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.96875, + "completions/mean_terminated_length": 272.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10071547608822584, + "epoch": 1.704, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07803252339363098, + "learning_rate": 1.5172956076433568e-06, + "loss": -0.0051, + "num_tokens": 7414439.0, + "reward": 13.553016662597656, + "reward_std": 1.0260266065597534, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7786349058151245, + "rewards/length2tails_reward/std": 0.3229070007801056, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.442206621170044, + "rewards/thermo_reward/std": 0.9379639029502869, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09173226915299892, + "epoch": 1.706, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10252796113491058, + "learning_rate": 1.5161977964135387e-06, + "loss": -0.0048, + "num_tokens": 7423199.0, + "reward": 13.331336975097656, + "reward_std": 1.8175561428070068, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.475644588470459, + "rewards/kidney_reward/std": 0.5754398703575134, + "rewards/length2tails_reward/mean": 0.7746952772140503, + "rewards/length2tails_reward/std": 0.2770718038082123, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.317038059234619, + "rewards/thermo_reward/std": 1.2796655893325806, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09569068439304829, + "epoch": 1.708, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19964933395385742, + "learning_rate": 1.515099136426145e-06, + "loss": 0.0002, + "num_tokens": 7431919.0, + "reward": 13.394378662109375, + "reward_std": 2.037860155105591, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.50772762298584, + "rewards/kidney_reward/std": 0.671763002872467, + "rewards/length2tails_reward/mean": 0.7298898696899414, + "rewards/length2tails_reward/std": 0.26895391941070557, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4099857807159424, + "rewards/thermo_reward/std": 1.0960407257080078, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09732735715806484, + "epoch": 1.71, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08672071248292923, + "learning_rate": 1.5139996294876465e-06, + "loss": -0.0021, + "num_tokens": 7440623.0, + "reward": 13.514305114746094, + "reward_std": 1.774261474609375, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5317673683166504, + "rewards/kidney_reward/std": 0.5357728600502014, + "rewards/length2tails_reward/mean": 0.6585451364517212, + "rewards/length2tails_reward/std": 0.3462117314338684, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4554989337921143, + "rewards/thermo_reward/std": 1.2565979957580566, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.10142287611961365, + "epoch": 1.712, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09674447774887085, + "learning_rate": 1.5128992774059062e-06, + "loss": -0.0026, + "num_tokens": 7449287.0, + "reward": 12.82474136352539, + "reward_std": 2.4676270484924316, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7117809057235718, + "rewards/length2tails_reward/std": 0.26468193531036377, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.747976541519165, + "rewards/thermo_reward/std": 2.237440347671509, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09790748357772827, + "epoch": 1.714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09507341682910919, + "learning_rate": 1.511798081990176e-06, + "loss": -0.0018, + "num_tokens": 7458028.0, + "reward": 13.191679954528809, + "reward_std": 1.9008152484893799, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4973230361938477, + "rewards/kidney_reward/std": 0.5914480686187744, + "rewards/length2tails_reward/mean": 0.6992174386978149, + "rewards/length2tails_reward/std": 0.3379260003566742, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.16325044631958, + "rewards/thermo_reward/std": 1.4118142127990723, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09885876066982746, + "epoch": 1.716, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07631216198205948, + "learning_rate": 1.510696045051096e-06, + "loss": -0.0075, + "num_tokens": 7466746.0, + "reward": 13.255902290344238, + "reward_std": 1.6920017004013062, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4815330505371094, + "rewards/kidney_reward/std": 0.5446864366531372, + "rewards/length2tails_reward/mean": 0.677985429763794, + "rewards/length2tails_reward/std": 0.34782329201698303, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.24538516998291, + "rewards/thermo_reward/std": 1.4120094776153564, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10496858600527048, + "epoch": 1.718, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12887302041053772, + "learning_rate": 1.5095931684006882e-06, + "loss": 0.0028, + "num_tokens": 7475448.0, + "reward": 13.230815887451172, + "reward_std": 1.7123416662216187, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.6752799153327942, + "rewards/length2tails_reward/std": 0.31890955567359924, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1303417682647705, + "rewards/thermo_reward/std": 1.5595749616622925, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 271.09375, + "completions/mean_terminated_length": 271.09375, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.10163374803960323, + "epoch": 1.72, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.293250560760498, + "learning_rate": 1.5084894538523566e-06, + "loss": 0.0121, + "num_tokens": 7484155.0, + "reward": 12.42259407043457, + "reward_std": 3.9458916187286377, + "rewards/fitness_reward/mean": 7.188657283782959, + "rewards/fitness_reward/std": 0.7179933190345764, + "rewards/kidney_reward/mean": 2.3019797801971436, + "rewards/kidney_reward/std": 1.239792823791504, + "rewards/length2tails_reward/mean": 0.7115887999534607, + "rewards/length2tails_reward/std": 0.3159739375114441, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.760798454284668, + "rewards/thermo_reward/std": 2.2771968841552734, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10072409827262163, + "epoch": 1.722, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09440076351165771, + "learning_rate": 1.507384903220882e-06, + "loss": -0.0049, + "num_tokens": 7492876.0, + "reward": 13.463882446289062, + "reward_std": 1.055253028869629, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7122754454612732, + "rewards/length2tails_reward/std": 0.3253532946109772, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.304990291595459, + "rewards/thermo_reward/std": 1.043897032737732, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10071540251374245, + "epoch": 1.724, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.134958416223526, + "learning_rate": 1.506279518322421e-06, + "loss": 0.0022, + "num_tokens": 7501639.0, + "reward": 12.194796562194824, + "reward_std": 3.917198657989502, + "rewards/fitness_reward/mean": 6.863818168640137, + "rewards/fitness_reward/std": 2.238445281982422, + "rewards/kidney_reward/mean": 2.2814149856567383, + "rewards/kidney_reward/std": 0.9608786106109619, + "rewards/length2tails_reward/mean": 0.7513444423675537, + "rewards/length2tails_reward/std": 0.31943386793136597, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.874429702758789, + "rewards/thermo_reward/std": 1.6127493381500244, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09387815557420254, + "epoch": 1.726, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08303510397672653, + "learning_rate": 1.5051733009745012e-06, + "loss": -0.0002, + "num_tokens": 7510395.0, + "reward": 13.363080978393555, + "reward_std": 2.55647349357605, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4396750926971436, + "rewards/kidney_reward/std": 0.9102076292037964, + "rewards/length2tails_reward/mean": 0.8173313140869141, + "rewards/length2tails_reward/std": 0.21922850608825684, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.437997817993164, + "rewards/thermo_reward/std": 1.3501050472259521, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10419859923422337, + "epoch": 1.728, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1179058626294136, + "learning_rate": 1.5040662529960187e-06, + "loss": 0.0029, + "num_tokens": 7519149.0, + "reward": 13.316216468811035, + "reward_std": 1.682897686958313, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.456533432006836, + "rewards/kidney_reward/std": 0.5471765995025635, + "rewards/length2tails_reward/mean": 0.8196427822113037, + "rewards/length2tails_reward/std": 0.21029697358608246, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3165335655212402, + "rewards/thermo_reward/std": 1.176936149597168, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10365048982203007, + "epoch": 1.73, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12642066180706024, + "learning_rate": 1.5029583762072355e-06, + "loss": -0.0025, + "num_tokens": 7527884.0, + "reward": 11.681166648864746, + "reward_std": 4.486652374267578, + "rewards/fitness_reward/mean": 7.049827575683594, + "rewards/fitness_reward/std": 1.7613033056259155, + "rewards/kidney_reward/mean": 2.210993766784668, + "rewards/kidney_reward/std": 1.164110779762268, + "rewards/length2tails_reward/mean": 0.702089786529541, + "rewards/length2tails_reward/std": 0.3295642137527466, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.250135898590088, + "rewards/thermo_reward/std": 2.3425369262695312, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.5625, + "completions/mean_terminated_length": 273.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10005826782435179, + "epoch": 1.732, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2355157732963562, + "learning_rate": 1.5018496724297775e-06, + "loss": -0.0039, + "num_tokens": 7536670.0, + "reward": 12.99422550201416, + "reward_std": 2.591430425643921, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.46018648147583, + "rewards/kidney_reward/std": 0.6573600769042969, + "rewards/length2tails_reward/mean": 0.8218222856521606, + "rewards/length2tails_reward/std": 0.2646983563899994, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0481810569763184, + "rewards/thermo_reward/std": 1.797224998474121, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.40625, + "completions/mean_terminated_length": 272.40625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0915538715198636, + "epoch": 1.734, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12455841153860092, + "learning_rate": 1.5007401434866288e-06, + "loss": 0.002, + "num_tokens": 7545419.0, + "reward": 13.686896324157715, + "reward_std": 0.6071659326553345, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7674039602279663, + "rewards/length2tails_reward/std": 0.2526814043521881, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498504638671875, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09496522974222898, + "epoch": 1.736, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1116664856672287, + "learning_rate": 1.4996297912021315e-06, + "loss": 0.0045, + "num_tokens": 7554152.0, + "reward": 13.388604164123535, + "reward_std": 1.7250062227249146, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5111966133117676, + "rewards/kidney_reward/std": 0.5161065459251404, + "rewards/length2tails_reward/mean": 0.7519122362136841, + "rewards/length2tails_reward/std": 0.28855007886886597, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.341031074523926, + "rewards/thermo_reward/std": 1.2499605417251587, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08507314044982195, + "epoch": 1.738, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9078298807144165, + "learning_rate": 1.4985186174019805e-06, + "loss": -0.0003, + "num_tokens": 7562902.0, + "reward": 11.907567977905273, + "reward_std": 5.713685512542725, + "rewards/fitness_reward/mean": 6.546481132507324, + "rewards/fitness_reward/std": 2.9839866161346436, + "rewards/kidney_reward/mean": 2.199110984802246, + "rewards/kidney_reward/std": 1.447792410850525, + "rewards/length2tails_reward/mean": 0.7063984870910645, + "rewards/length2tails_reward/std": 0.3167133331298828, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9913363456726074, + "rewards/thermo_reward/std": 1.8961188793182373, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09846653696149588, + "epoch": 1.74, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12279684096574783, + "learning_rate": 1.4974066239132218e-06, + "loss": -0.0032, + "num_tokens": 7571634.0, + "reward": 12.903898239135742, + "reward_std": 2.871889352798462, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3835763931274414, + "rewards/kidney_reward/std": 0.8628641366958618, + "rewards/length2tails_reward/mean": 0.7258918285369873, + "rewards/length2tails_reward/std": 0.3222261369228363, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0440564155578613, + "rewards/thermo_reward/std": 1.848998785018921, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10395078919827938, + "epoch": 1.742, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1666652113199234, + "learning_rate": 1.4962938125642501e-06, + "loss": 0.0005, + "num_tokens": 7580394.0, + "reward": 12.472291946411133, + "reward_std": 4.9836344718933105, + "rewards/fitness_reward/mean": 6.719928741455078, + "rewards/fitness_reward/std": 2.5253584384918213, + "rewards/kidney_reward/mean": 2.338550567626953, + "rewards/kidney_reward/std": 1.087242603302002, + "rewards/length2tails_reward/mean": 0.7961379289627075, + "rewards/length2tails_reward/std": 0.26709046959877014, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2341983318328857, + "rewards/thermo_reward/std": 1.4723800420761108, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.71875, + "completions/mean_terminated_length": 272.71875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09795780386775732, + "epoch": 1.744, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34581565856933594, + "learning_rate": 1.495180185184803e-06, + "loss": 0.0037, + "num_tokens": 7589153.0, + "reward": 12.669710159301758, + "reward_std": 5.741780757904053, + "rewards/fitness_reward/mean": 6.930096626281738, + "rewards/fitness_reward/std": 2.438606023788452, + "rewards/kidney_reward/mean": 2.3235361576080322, + "rewards/kidney_reward/std": 1.5616281032562256, + "rewards/length2tails_reward/mean": 0.7937979102134705, + "rewards/length2tails_reward/std": 0.24576689302921295, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2366983890533447, + "rewards/thermo_reward/std": 1.8013713359832764, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.1063917875289917, + "epoch": 1.746, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13437135517597198, + "learning_rate": 1.4940657436059616e-06, + "loss": 0.0014, + "num_tokens": 7597884.0, + "reward": 12.676504135131836, + "reward_std": 4.748298168182373, + "rewards/fitness_reward/mean": 7.001252174377441, + "rewards/fitness_reward/std": 2.0360865592956543, + "rewards/kidney_reward/mean": 2.4072861671447754, + "rewards/kidney_reward/std": 1.2399449348449707, + "rewards/length2tails_reward/mean": 0.7146443128585815, + "rewards/length2tails_reward/std": 0.2839556634426117, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0964999198913574, + "rewards/thermo_reward/std": 1.6927036046981812, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11050812620669603, + "epoch": 1.748, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1196875348687172, + "learning_rate": 1.492950489660145e-06, + "loss": -0.0013, + "num_tokens": 7606642.0, + "reward": 12.8109712600708, + "reward_std": 2.9641854763031006, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3206534385681152, + "rewards/kidney_reward/std": 0.9683699607849121, + "rewards/length2tails_reward/mean": 0.7905367612838745, + "rewards/length2tails_reward/std": 0.2640564441680908, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0075883865356445, + "rewards/thermo_reward/std": 1.8859575986862183, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09981518425047398, + "epoch": 1.75, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1955115646123886, + "learning_rate": 1.4918344251811078e-06, + "loss": 0.0011, + "num_tokens": 7615390.0, + "reward": 12.619268417358398, + "reward_std": 4.635945796966553, + "rewards/fitness_reward/mean": 7.005709171295166, + "rewards/fitness_reward/std": 2.0108749866485596, + "rewards/kidney_reward/mean": 2.356231451034546, + "rewards/kidney_reward/std": 1.0809255838394165, + "rewards/length2tails_reward/mean": 0.7344810366630554, + "rewards/length2tails_reward/std": 0.33714422583580017, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0838804244995117, + "rewards/thermo_reward/std": 1.7415902614593506, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11094592232257128, + "epoch": 1.752, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08621875941753387, + "learning_rate": 1.490717552003938e-06, + "loss": -0.0012, + "num_tokens": 7624147.0, + "reward": 13.21161937713623, + "reward_std": 1.822740077972412, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7981569170951843, + "rewards/length2tails_reward/std": 0.24679215252399445, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0988576412200928, + "rewards/thermo_reward/std": 1.6984132528305054, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.40625, + "completions/mean_terminated_length": 272.40625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10080702416598797, + "epoch": 1.754, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15915358066558838, + "learning_rate": 1.4895998719650523e-06, + "loss": 0.0016, + "num_tokens": 7632896.0, + "reward": 12.766881942749023, + "reward_std": 4.318146228790283, + "rewards/fitness_reward/mean": 6.9977922439575195, + "rewards/fitness_reward/std": 2.055662155151367, + "rewards/kidney_reward/mean": 2.4041614532470703, + "rewards/kidney_reward/std": 0.9624440670013428, + "rewards/length2tails_reward/mean": 0.7578998804092407, + "rewards/length2tails_reward/std": 0.274631142616272, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.189138889312744, + "rewards/thermo_reward/std": 1.492705225944519, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0937582403421402, + "epoch": 1.756, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0773591473698616, + "learning_rate": 1.4884813869021952e-06, + "loss": -0.0012, + "num_tokens": 7641646.0, + "reward": 13.149742126464844, + "reward_std": 1.9497835636138916, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.436121702194214, + "rewards/kidney_reward/std": 0.6512018442153931, + "rewards/length2tails_reward/mean": 0.7778726816177368, + "rewards/length2tails_reward/std": 0.24082037806510925, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.174647808074951, + "rewards/thermo_reward/std": 1.4494233131408691, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09923964831978083, + "epoch": 1.758, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13162273168563843, + "learning_rate": 1.4873620986544347e-06, + "loss": 0.0015, + "num_tokens": 7650389.0, + "reward": 12.932975769042969, + "reward_std": 3.2268340587615967, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.381955623626709, + "rewards/kidney_reward/std": 1.0001392364501953, + "rewards/length2tails_reward/mean": 0.7600358724594116, + "rewards/length2tails_reward/std": 0.27038806676864624, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1288509368896484, + "rewards/thermo_reward/std": 1.8431106805801392, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09573240112513304, + "epoch": 1.76, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06183909624814987, + "learning_rate": 1.4862420090621581e-06, + "loss": -0.0036, + "num_tokens": 7659104.0, + "reward": 13.072014808654785, + "reward_std": 3.2107491493225098, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.511730670928955, + "rewards/kidney_reward/std": 0.5132253766059875, + "rewards/length2tails_reward/mean": 0.6726909875869751, + "rewards/length2tails_reward/std": 0.34120577573776245, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.339961528778076, + "rewards/thermo_reward/std": 1.349816918373108, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09770231042057276, + "epoch": 1.762, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08552416414022446, + "learning_rate": 1.485121119967072e-06, + "loss": -0.0026, + "num_tokens": 7667827.0, + "reward": 13.009862899780273, + "reward_std": 2.53602933883667, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.460869550704956, + "rewards/kidney_reward/std": 0.792319655418396, + "rewards/length2tails_reward/mean": 0.7326198816299438, + "rewards/length2tails_reward/std": 0.3129405975341797, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0720553398132324, + "rewards/thermo_reward/std": 1.6470363140106201, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10478513780981302, + "epoch": 1.764, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1234990730881691, + "learning_rate": 1.4839994332121968e-06, + "loss": -0.0003, + "num_tokens": 7676588.0, + "reward": 13.49679946899414, + "reward_std": 1.194043755531311, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.792144775390625, + "rewards/length2tails_reward/std": 0.24374578893184662, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3572797775268555, + "rewards/thermo_reward/std": 1.16750168800354, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09663060214370489, + "epoch": 1.766, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20788384974002838, + "learning_rate": 1.4828769506418643e-06, + "loss": 0.002, + "num_tokens": 7685311.0, + "reward": 13.174077987670898, + "reward_std": 1.7398223876953125, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4979214668273926, + "rewards/kidney_reward/std": 0.5881804823875427, + "rewards/length2tails_reward/mean": 0.7213119864463806, + "rewards/length2tails_reward/std": 0.3026640713214874, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.142841339111328, + "rewards/thermo_reward/std": 1.3288516998291016, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.15625, + "completions/mean_terminated_length": 273.15625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09734796825796366, + "epoch": 1.768, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1145319864153862, + "learning_rate": 1.4817536741017151e-06, + "loss": 0.0006, + "num_tokens": 7694084.0, + "reward": 13.582418441772461, + "reward_std": 0.7006555795669556, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7939411401748657, + "rewards/length2tails_reward/std": 0.2659618556499481, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.470078706741333, + "rewards/thermo_reward/std": 0.5830413699150085, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10007060971111059, + "epoch": 1.77, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07956361025571823, + "learning_rate": 1.4806296054386957e-06, + "loss": -0.0044, + "num_tokens": 7702848.0, + "reward": 13.668158531188965, + "reward_std": 1.0154482126235962, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7641856670379639, + "rewards/length2tails_reward/std": 0.33650773763656616, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5314338207244873, + "rewards/thermo_reward/std": 0.8651680946350098, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09572915825992823, + "epoch": 1.772, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5946225523948669, + "learning_rate": 1.479504746501054e-06, + "loss": -0.0036, + "num_tokens": 7711561.0, + "reward": 13.458148956298828, + "reward_std": 1.678314208984375, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.538149356842041, + "rewards/kidney_reward/std": 0.4996722936630249, + "rewards/length2tails_reward/mean": 0.6941894292831421, + "rewards/length2tails_reward/std": 0.3023573160171509, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4469056129455566, + "rewards/thermo_reward/std": 0.9156622290611267, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.099006243981421, + "epoch": 1.774, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09899494796991348, + "learning_rate": 1.4783790991383378e-06, + "loss": -0.0013, + "num_tokens": 7720303.0, + "reward": 13.350641250610352, + "reward_std": 1.644244909286499, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5380842685699463, + "rewards/kidney_reward/std": 0.5000393986701965, + "rewards/length2tails_reward/mean": 0.7450613379478455, + "rewards/length2tails_reward/std": 0.29141712188720703, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.276865243911743, + "rewards/thermo_reward/std": 1.2720036506652832, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10529050510376692, + "epoch": 1.776, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0689932107925415, + "learning_rate": 1.4772526652013922e-06, + "loss": -0.0061, + "num_tokens": 7729047.0, + "reward": 12.381019592285156, + "reward_std": 3.4639840126037598, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.185664176940918, + "rewards/kidney_reward/std": 1.2540359497070312, + "rewards/length2tails_reward/mean": 0.7414818406105042, + "rewards/length2tails_reward/std": 0.3098289966583252, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.717531681060791, + "rewards/thermo_reward/std": 2.14951491355896, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09431830234825611, + "epoch": 1.778, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10811066627502441, + "learning_rate": 1.4761254465423536e-06, + "loss": -0.0004, + "num_tokens": 7737797.0, + "reward": 12.986416816711426, + "reward_std": 2.2744932174682617, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.3942477703094482, + "rewards/kidney_reward/std": 0.7171282172203064, + "rewards/length2tails_reward/mean": 0.784565806388855, + "rewards/length2tails_reward/std": 0.2758873701095581, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.167546272277832, + "rewards/thermo_reward/std": 1.4129695892333984, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 268.9375, + "completions/mean_terminated_length": 268.9375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.09791536442935467, + "epoch": 1.78, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6448939442634583, + "learning_rate": 1.474997445014651e-06, + "loss": -0.04, + "num_tokens": 7746435.0, + "reward": 11.629158020019531, + "reward_std": 5.494712829589844, + "rewards/fitness_reward/mean": 6.617328643798828, + "rewards/fitness_reward/std": 2.9274072647094727, + "rewards/kidney_reward/mean": 2.2886364459991455, + "rewards/kidney_reward/std": 1.2951936721801758, + "rewards/length2tails_reward/mean": 0.6833776831626892, + "rewards/length2tails_reward/std": 0.3136868476867676, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.554854393005371, + "rewards/thermo_reward/std": 2.314518928527832, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.40625, + "completions/mean_terminated_length": 271.40625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09088954981416464, + "epoch": 1.782, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13576436042785645, + "learning_rate": 1.4738686624729987e-06, + "loss": -0.0015, + "num_tokens": 7755152.0, + "reward": 12.47293472290039, + "reward_std": 3.1947779655456543, + "rewards/fitness_reward/mean": 7.131148338317871, + "rewards/fitness_reward/std": 0.7751544713973999, + "rewards/kidney_reward/mean": 2.206810474395752, + "rewards/kidney_reward/std": 1.0620572566986084, + "rewards/length2tails_reward/mean": 0.6976714134216309, + "rewards/length2tails_reward/std": 0.3364195227622986, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9652092456817627, + "rewards/thermo_reward/std": 1.6839019060134888, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09869909752160311, + "epoch": 1.784, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10662391781806946, + "learning_rate": 1.472739100773396e-06, + "loss": 0.0029, + "num_tokens": 7763902.0, + "reward": 13.120617866516113, + "reward_std": 1.752073884010315, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4723806381225586, + "rewards/kidney_reward/std": 0.5926058888435364, + "rewards/length2tails_reward/mean": 0.7800338864326477, + "rewards/length2tails_reward/std": 0.26246973872184753, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.109048366546631, + "rewards/thermo_reward/std": 1.3039993047714233, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09561163000762463, + "epoch": 1.786, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10369518399238586, + "learning_rate": 1.4716087617731242e-06, + "loss": -0.0053, + "num_tokens": 7772646.0, + "reward": 13.807502746582031, + "reward_std": 0.530958354473114, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7768849730491638, + "rewards/length2tails_reward/std": 0.2708515226840973, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09534350223839283, + "epoch": 1.788, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08410871028900146, + "learning_rate": 1.4704776473307406e-06, + "loss": -0.0006, + "num_tokens": 7781372.0, + "reward": 13.64209270477295, + "reward_std": 0.6280385255813599, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7182252407073975, + "rewards/length2tails_reward/std": 0.36311620473861694, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.50996470451355, + "rewards/thermo_reward/std": 0.5615194439888, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09412556514143944, + "epoch": 1.79, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08788694441318512, + "learning_rate": 1.4693457593060793e-06, + "loss": -0.0019, + "num_tokens": 7790108.0, + "reward": 13.598801612854004, + "reward_std": 0.6531413793563843, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.6841665506362915, + "rewards/length2tails_reward/std": 0.33494260907173157, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.470078706741333, + "rewards/thermo_reward/std": 0.5830413699150085, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09312474494799972, + "epoch": 1.792, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07696857303380966, + "learning_rate": 1.4682130995602458e-06, + "loss": -0.0045, + "num_tokens": 7798796.0, + "reward": 12.536702156066895, + "reward_std": 3.5887157917022705, + "rewards/fitness_reward/mean": 7.051756381988525, + "rewards/fitness_reward/std": 1.7503925561904907, + "rewards/kidney_reward/mean": 2.402106523513794, + "rewards/kidney_reward/std": 0.7840635180473328, + "rewards/length2tails_reward/mean": 0.6466629505157471, + "rewards/length2tails_reward/std": 0.30107077956199646, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.918172836303711, + "rewards/thermo_reward/std": 1.7599866390228271, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.10589804220944643, + "epoch": 1.794, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10630529373884201, + "learning_rate": 1.4670796699556143e-06, + "loss": -0.0026, + "num_tokens": 7807557.0, + "reward": 11.9832124710083, + "reward_std": 5.638438701629639, + "rewards/fitness_reward/mean": 6.682827472686768, + "rewards/fitness_reward/std": 2.6741294860839844, + "rewards/kidney_reward/mean": 2.3010666370391846, + "rewards/kidney_reward/std": 1.1823711395263672, + "rewards/length2tails_reward/mean": 0.7523695230484009, + "rewards/length2tails_reward/std": 0.27255868911743164, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8240814208984375, + "rewards/thermo_reward/std": 2.0244994163513184, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0914301173761487, + "epoch": 1.796, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07685575634241104, + "learning_rate": 1.4659454723558246e-06, + "loss": -0.0029, + "num_tokens": 7816305.0, + "reward": 13.886134147644043, + "reward_std": 0.4406987428665161, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7654820680618286, + "rewards/length2tails_reward/std": 0.29283666610717773, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.4375, + "completions/mean_terminated_length": 273.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09987590741366148, + "epoch": 1.798, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0851331502199173, + "learning_rate": 1.4648105086257807e-06, + "loss": -0.0058, + "num_tokens": 7825087.0, + "reward": 13.067283630371094, + "reward_std": 4.014723777770996, + "rewards/fitness_reward/mean": 7.032796859741211, + "rewards/fitness_reward/std": 1.8576456308364868, + "rewards/kidney_reward/mean": 2.4413154125213623, + "rewards/kidney_reward/std": 0.9010650515556335, + "rewards/length2tails_reward/mean": 0.8494006395339966, + "rewards/length2tails_reward/std": 0.22823107242584229, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4082322120666504, + "rewards/thermo_reward/std": 1.3034515380859375, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09739398490637541, + "epoch": 1.8, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13894827663898468, + "learning_rate": 1.4636747806316444e-06, + "loss": -0.0016, + "num_tokens": 7833838.0, + "reward": 13.313909530639648, + "reward_std": 1.5198371410369873, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7758152484893799, + "rewards/length2tails_reward/std": 0.29189664125442505, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.230740547180176, + "rewards/thermo_reward/std": 1.29622483253479, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09956398233771324, + "epoch": 1.802, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12740930914878845, + "learning_rate": 1.4625382902408354e-06, + "loss": 0.0056, + "num_tokens": 7842577.0, + "reward": 13.187545776367188, + "reward_std": 3.881068229675293, + "rewards/fitness_reward/mean": 7.049934387207031, + "rewards/fitness_reward/std": 1.7607014179229736, + "rewards/kidney_reward/mean": 2.4841480255126953, + "rewards/kidney_reward/std": 0.8051493167877197, + "rewards/length2tails_reward/mean": 0.7785500884056091, + "rewards/length2tails_reward/std": 0.22710512578487396, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.475609302520752, + "rewards/thermo_reward/std": 1.3542872667312622, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.1875, + "completions/mean_terminated_length": 273.1875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09471298195421696, + "epoch": 1.804, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12734858691692352, + "learning_rate": 1.4614010393220262e-06, + "loss": -0.0005, + "num_tokens": 7851351.0, + "reward": 13.22269058227539, + "reward_std": 1.6711552143096924, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8141377568244934, + "rewards/length2tails_reward/std": 0.25066789984703064, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1658401489257812, + "rewards/thermo_reward/std": 1.358097791671753, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0919131813570857, + "epoch": 1.806, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08095831423997879, + "learning_rate": 1.4602630297451407e-06, + "loss": -0.0013, + "num_tokens": 7860061.0, + "reward": 13.463017463684082, + "reward_std": 0.956395149230957, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.6921268105506897, + "rewards/length2tails_reward/std": 0.3316554129123688, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3334994316101074, + "rewards/thermo_reward/std": 0.9141343832015991, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09086079522967339, + "epoch": 1.808, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35862961411476135, + "learning_rate": 1.4591242633813494e-06, + "loss": -0.0045, + "num_tokens": 7868795.0, + "reward": 13.240551948547363, + "reward_std": 2.3463644981384277, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.4671764373779297, + "rewards/kidney_reward/std": 0.6201311349868774, + "rewards/length2tails_reward/mean": 0.7406184673309326, + "rewards/length2tails_reward/std": 0.30301016569137573, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.353147029876709, + "rewards/thermo_reward/std": 1.2772053480148315, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.1875, + "completions/mean_terminated_length": 270.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09792979806661606, + "epoch": 1.81, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08443191647529602, + "learning_rate": 1.4579847421030676e-06, + "loss": -0.0028, + "num_tokens": 7877473.0, + "reward": 13.049362182617188, + "reward_std": 2.3406870365142822, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.480879306793213, + "rewards/kidney_reward/std": 0.6816642880439758, + "rewards/length2tails_reward/mean": 0.6595234274864197, + "rewards/length2tails_reward/std": 0.30757832527160645, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0413448810577393, + "rewards/thermo_reward/std": 1.79043447971344, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10100154858082533, + "epoch": 1.812, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06824267655611038, + "learning_rate": 1.4568444677839515e-06, + "loss": -0.0032, + "num_tokens": 7886193.0, + "reward": 13.647187232971191, + "reward_std": 0.9054825305938721, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7222320437431335, + "rewards/length2tails_reward/std": 0.28808388113975525, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4872992038726807, + "rewards/thermo_reward/std": 0.9007164835929871, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.1875, + "completions/mean_terminated_length": 273.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09898331295698881, + "epoch": 1.814, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10882727801799774, + "learning_rate": 1.4557034422988958e-06, + "loss": 0.0004, + "num_tokens": 7894967.0, + "reward": 11.802138328552246, + "reward_std": 5.2616496086120605, + "rewards/fitness_reward/mean": 6.572333335876465, + "rewards/fitness_reward/std": 2.8820300102233887, + "rewards/kidney_reward/mean": 2.1918716430664062, + "rewards/kidney_reward/std": 1.2480754852294922, + "rewards/length2tails_reward/mean": 0.7903756499290466, + "rewards/length2tails_reward/std": 0.3119876980781555, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.858895778656006, + "rewards/thermo_reward/std": 1.8779559135437012, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09829057846218348, + "epoch": 1.8159999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1091950312256813, + "learning_rate": 1.4545616675240307e-06, + "loss": 0.0031, + "num_tokens": 7903699.0, + "reward": 13.163553237915039, + "reward_std": 2.364424705505371, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.49746036529541, + "rewards/kidney_reward/std": 0.5906985402107239, + "rewards/length2tails_reward/mean": 0.7393020987510681, + "rewards/length2tails_reward/std": 0.2952393889427185, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.188486337661743, + "rewards/thermo_reward/std": 1.5971673727035522, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.09047531802207232, + "epoch": 1.818, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15408550202846527, + "learning_rate": 1.4534191453367172e-06, + "loss": 0.0031, + "num_tokens": 7912386.0, + "reward": 13.424781799316406, + "reward_std": 1.3201360702514648, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7214326858520508, + "rewards/length2tails_reward/std": 0.29441601037979126, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.347050905227661, + "rewards/thermo_reward/std": 1.1589794158935547, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10163731779903173, + "epoch": 1.8199999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3144538104534149, + "learning_rate": 1.4522758776155464e-06, + "loss": 0.0021, + "num_tokens": 7921156.0, + "reward": 13.272607803344727, + "reward_std": 1.7322739362716675, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.365363597869873, + "rewards/kidney_reward/std": 0.7302173376083374, + "rewards/length2tails_reward/mean": 0.7840762734413147, + "rewards/length2tails_reward/std": 0.2829437255859375, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.36765193939209, + "rewards/thermo_reward/std": 1.1156370639801025, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.0990389846265316, + "epoch": 1.822, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11379135400056839, + "learning_rate": 1.4511318662403345e-06, + "loss": -0.0074, + "num_tokens": 7929884.0, + "reward": 12.687387466430664, + "reward_std": 3.787317991256714, + "rewards/fitness_reward/mean": 6.938035011291504, + "rewards/fitness_reward/std": 1.840762972831726, + "rewards/kidney_reward/mean": 2.409886360168457, + "rewards/kidney_reward/std": 0.755768358707428, + "rewards/length2tails_reward/mean": 0.795867383480072, + "rewards/length2tails_reward/std": 0.2925042510032654, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.159878730773926, + "rewards/thermo_reward/std": 1.4481010437011719, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09324057400226593, + "epoch": 1.8239999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07792110741138458, + "learning_rate": 1.449987113092121e-06, + "loss": -0.0054, + "num_tokens": 7938614.0, + "reward": 13.062980651855469, + "reward_std": 2.5903031826019287, + "rewards/fitness_reward/mean": 6.999163627624512, + "rewards/fitness_reward/std": 2.047900915145874, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7137687802314758, + "rewards/length2tails_reward/std": 0.3201422393321991, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3480372428894043, + "rewards/thermo_reward/std": 1.0235670804977417, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 0.09945305716246367, + "epoch": 1.826, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17810320854187012, + "learning_rate": 1.448841620053165e-06, + "loss": -0.0045, + "num_tokens": 7947290.0, + "reward": 12.08353328704834, + "reward_std": 5.4303483963012695, + "rewards/fitness_reward/mean": 6.957911491394043, + "rewards/fitness_reward/std": 2.2812585830688477, + "rewards/kidney_reward/mean": 2.2508323192596436, + "rewards/kidney_reward/std": 1.38154137134552, + "rewards/length2tails_reward/mean": 0.7138035893440247, + "rewards/length2tails_reward/std": 0.3589658737182617, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.70340895652771, + "rewards/thermo_reward/std": 2.210334300994873, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10191969107836485, + "epoch": 1.8279999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06049925461411476, + "learning_rate": 1.4476953890069415e-06, + "loss": -0.0067, + "num_tokens": 7956033.0, + "reward": 13.038139343261719, + "reward_std": 2.406184196472168, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4171836376190186, + "rewards/kidney_reward/std": 0.6189330220222473, + "rewards/length2tails_reward/mean": 0.7486883401870728, + "rewards/length2tails_reward/std": 0.3089596927165985, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.084902763366699, + "rewards/thermo_reward/std": 1.8648375272750854, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09475038014352322, + "epoch": 1.83, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08417715132236481, + "learning_rate": 1.4465484218381398e-06, + "loss": -0.0048, + "num_tokens": 7964764.0, + "reward": 12.575175285339355, + "reward_std": 3.092132806777954, + "rewards/fitness_reward/mean": 7.073638916015625, + "rewards/fitness_reward/std": 0.8242490291595459, + "rewards/kidney_reward/mean": 2.2160050868988037, + "rewards/kidney_reward/std": 1.0591219663619995, + "rewards/length2tails_reward/mean": 0.7387404441833496, + "rewards/length2tails_reward/std": 0.2980632781982422, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1116576194763184, + "rewards/thermo_reward/std": 1.4491232633590698, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.46875, + "completions/mean_terminated_length": 273.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09082639962434769, + "epoch": 1.8319999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07699467241764069, + "learning_rate": 1.445400720432659e-06, + "loss": -0.0081, + "num_tokens": 7973547.0, + "reward": 12.984945297241211, + "reward_std": 2.1997668743133545, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.471334457397461, + "rewards/kidney_reward/std": 0.5981243252754211, + "rewards/length2tails_reward/mean": 0.7991466522216797, + "rewards/length2tails_reward/std": 0.30184128880500793, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9725115299224854, + "rewards/thermo_reward/std": 1.765310287475586, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10166769474744797, + "epoch": 1.834, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10199066251516342, + "learning_rate": 1.444252286677606e-06, + "loss": -0.0009, + "num_tokens": 7982298.0, + "reward": 13.361869812011719, + "reward_std": 1.6339774131774902, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7698524594306946, + "rewards/length2tails_reward/std": 0.2833806872367859, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2792983055114746, + "rewards/thermo_reward/std": 1.4069561958312988, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0994638055562973, + "epoch": 1.8359999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09854952245950699, + "learning_rate": 1.4431031224612913e-06, + "loss": -0.0054, + "num_tokens": 7991053.0, + "reward": 13.195050239562988, + "reward_std": 2.268061876296997, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.512495756149292, + "rewards/kidney_reward/std": 0.6447898745536804, + "rewards/length2tails_reward/mean": 0.7886925935745239, + "rewards/length2tails_reward/std": 0.23495440185070038, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.200009346008301, + "rewards/thermo_reward/std": 1.5066677331924438, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.84375, + "completions/mean_terminated_length": 273.84375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.09607025887817144, + "epoch": 1.838, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0761309266090393, + "learning_rate": 1.4419532296732268e-06, + "loss": -0.0036, + "num_tokens": 7999848.0, + "reward": 13.07663345336914, + "reward_std": 2.1593430042266846, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.437307357788086, + "rewards/kidney_reward/std": 0.6450527310371399, + "rewards/length2tails_reward/mean": 0.8511402010917664, + "rewards/length2tails_reward/std": 0.1919945776462555, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1505355834960938, + "rewards/thermo_reward/std": 1.3073687553405762, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08434112183749676, + "epoch": 1.8399999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.112014539539814, + "learning_rate": 1.4408026102041222e-06, + "loss": -0.0042, + "num_tokens": 8008547.0, + "reward": 13.352241516113281, + "reward_std": 1.2534050941467285, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.5977305173873901, + "rewards/length2tails_reward/std": 0.40180954337120056, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.204803228378296, + "rewards/thermo_reward/std": 1.2496801614761353, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0949926944449544, + "epoch": 1.842, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07384954392910004, + "learning_rate": 1.4396512659458822e-06, + "loss": -0.0045, + "num_tokens": 8017297.0, + "reward": 13.42481517791748, + "reward_std": 1.2391330003738403, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7046566009521484, + "rewards/length2tails_reward/std": 0.3201577961444855, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.294044017791748, + "rewards/thermo_reward/std": 1.095947027206421, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10082920826971531, + "epoch": 1.8439999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06823424249887466, + "learning_rate": 1.4384991987916028e-06, + "loss": -0.0062, + "num_tokens": 8026069.0, + "reward": 12.671717643737793, + "reward_std": 4.562189102172852, + "rewards/fitness_reward/mean": 6.998061180114746, + "rewards/fitness_reward/std": 2.054140329360962, + "rewards/kidney_reward/mean": 2.391406774520874, + "rewards/kidney_reward/std": 1.0328963994979858, + "rewards/length2tails_reward/mean": 0.8234930038452148, + "rewards/length2tails_reward/std": 0.24498499929904938, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.099900722503662, + "rewards/thermo_reward/std": 1.7860603332519531, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.40625, + "completions/mean_terminated_length": 272.40625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10238801874220371, + "epoch": 1.846, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11172787845134735, + "learning_rate": 1.4373464106355695e-06, + "loss": 0.0017, + "num_tokens": 8034818.0, + "reward": 12.963394165039062, + "reward_std": 2.5045695304870605, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3441781997680664, + "rewards/kidney_reward/std": 0.895361065864563, + "rewards/length2tails_reward/mean": 0.7489575147628784, + "rewards/length2tails_reward/std": 0.32563844323158264, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1406450271606445, + "rewards/thermo_reward/std": 1.433326244354248, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09571900684386492, + "epoch": 1.8479999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12115728855133057, + "learning_rate": 1.4361929033732526e-06, + "loss": 0.0029, + "num_tokens": 8043526.0, + "reward": 13.049151420593262, + "reward_std": 3.674513339996338, + "rewards/fitness_reward/mean": 7.050969123840332, + "rewards/fitness_reward/std": 1.7548457384109497, + "rewards/kidney_reward/mean": 2.4927873611450195, + "rewards/kidney_reward/std": 0.7562783360481262, + "rewards/length2tails_reward/mean": 0.6843692064285278, + "rewards/length2tails_reward/std": 0.3321457505226135, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3369572162628174, + "rewards/thermo_reward/std": 1.3621774911880493, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10554734244942665, + "epoch": 1.85, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09987124055624008, + "learning_rate": 1.4350386789013044e-06, + "loss": -0.0082, + "num_tokens": 8052284.0, + "reward": 12.666133880615234, + "reward_std": 4.327279090881348, + "rewards/fitness_reward/mean": 6.991957187652588, + "rewards/fitness_reward/std": 1.7827560901641846, + "rewards/kidney_reward/mean": 2.3112308979034424, + "rewards/kidney_reward/std": 1.0905444622039795, + "rewards/length2tails_reward/mean": 0.7796467542648315, + "rewards/length2tails_reward/std": 0.2763543128967285, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.184980869293213, + "rewards/thermo_reward/std": 1.8170864582061768, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5, + "completions/mean_terminated_length": 272.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09814912639558315, + "epoch": 1.8519999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12721046805381775, + "learning_rate": 1.433883739117558e-06, + "loss": -0.0046, + "num_tokens": 8061036.0, + "reward": 13.344095230102539, + "reward_std": 1.9957871437072754, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.495349168777466, + "rewards/kidney_reward/std": 0.6022319197654724, + "rewards/length2tails_reward/mean": 0.7750345468521118, + "rewards/length2tails_reward/std": 0.2605418860912323, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3675663471221924, + "rewards/thermo_reward/std": 1.116063117980957, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.65625, + "completions/mean_terminated_length": 273.65625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09740834962576628, + "epoch": 1.854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.353313148021698, + "learning_rate": 1.432728085921021e-06, + "loss": -0.0035, + "num_tokens": 8069825.0, + "reward": 12.018280982971191, + "reward_std": 5.810842514038086, + "rewards/fitness_reward/mean": 6.850230693817139, + "rewards/fitness_reward/std": 2.312100648880005, + "rewards/kidney_reward/mean": 2.152690887451172, + "rewards/kidney_reward/std": 1.591987133026123, + "rewards/length2tails_reward/mean": 0.8441159129142761, + "rewards/length2tails_reward/std": 0.19898547232151031, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8309483528137207, + "rewards/thermo_reward/std": 2.15840482711792, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09615112002938986, + "epoch": 1.8559999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11188706755638123, + "learning_rate": 1.4315717212118751e-06, + "loss": 0.0013, + "num_tokens": 8078558.0, + "reward": 13.501045227050781, + "reward_std": 1.2505115270614624, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7311096787452698, + "rewards/length2tails_reward/std": 0.2580016255378723, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3676280975341797, + "rewards/thermo_reward/std": 1.1157556772232056, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.21875, + "completions/mean_terminated_length": 273.21875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09848130866885185, + "epoch": 1.858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1136355847120285, + "learning_rate": 1.4304146468914713e-06, + "loss": -0.0023, + "num_tokens": 8087333.0, + "reward": 12.441036224365234, + "reward_std": 5.264050483703613, + "rewards/fitness_reward/mean": 6.976176738739014, + "rewards/fitness_reward/std": 2.177935838699341, + "rewards/kidney_reward/mean": 2.253908157348633, + "rewards/kidney_reward/std": 1.4781852960586548, + "rewards/length2tails_reward/mean": 0.8325543403625488, + "rewards/length2tails_reward/std": 0.1891918182373047, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0276966094970703, + "rewards/thermo_reward/std": 1.9368516206741333, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08642545621842146, + "epoch": 1.8599999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.050249114632606506, + "learning_rate": 1.4292568648623274e-06, + "loss": -0.0048, + "num_tokens": 8096051.0, + "reward": 13.441530227661133, + "reward_std": 2.050464391708374, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4745676517486572, + "rewards/kidney_reward/std": 0.7164823412895203, + "rewards/length2tails_reward/mean": 0.6919739246368408, + "rewards/length2tails_reward/std": 0.32226404547691345, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.436579704284668, + "rewards/thermo_reward/std": 1.415424108505249, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09469401743263006, + "epoch": 1.862, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07521870732307434, + "learning_rate": 1.4280983770281256e-06, + "loss": -0.0055, + "num_tokens": 8104817.0, + "reward": 13.544778823852539, + "reward_std": 1.2863622903823853, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8186159729957581, + "rewards/length2tails_reward/std": 0.23112718760967255, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3752520084381104, + "rewards/thermo_reward/std": 1.2737072706222534, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10019571334123611, + "epoch": 1.8639999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10404334962368011, + "learning_rate": 1.4269391852937074e-06, + "loss": 0.0045, + "num_tokens": 8113572.0, + "reward": 13.64660358428955, + "reward_std": 0.6189796328544617, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7633348703384399, + "rewards/length2tails_reward/std": 0.3237886428833008, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.50996470451355, + "rewards/thermo_reward/std": 0.5615194439888, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10216997563838959, + "epoch": 1.866, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10921880602836609, + "learning_rate": 1.4257792915650725e-06, + "loss": -0.0061, + "num_tokens": 8122306.0, + "reward": 13.247479438781738, + "reward_std": 2.2764835357666016, + "rewards/fitness_reward/mean": 7.0529656410217285, + "rewards/fitness_reward/std": 1.7435520887374878, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7549443244934082, + "rewards/length2tails_reward/std": 0.2997691333293915, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4472579956054688, + "rewards/thermo_reward/std": 0.9139991998672485, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09624828863888979, + "epoch": 1.8679999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.057099346071481705, + "learning_rate": 1.4246186977493752e-06, + "loss": -0.005, + "num_tokens": 8131020.0, + "reward": 13.203994750976562, + "reward_std": 2.49579119682312, + "rewards/fitness_reward/mean": 7.052845478057861, + "rewards/fitness_reward/std": 1.7442326545715332, + "rewards/kidney_reward/mean": 2.5115790367126465, + "rewards/kidney_reward/std": 0.5140424966812134, + "rewards/length2tails_reward/mean": 0.6949120759963989, + "rewards/length2tails_reward/std": 0.3189202845096588, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.470078706741333, + "rewards/thermo_reward/std": 0.5830413699150085, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08982496801763773, + "epoch": 1.87, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06818225234746933, + "learning_rate": 1.42345740575492e-06, + "loss": 0.0002, + "num_tokens": 8139764.0, + "reward": 13.297567367553711, + "reward_std": 1.665936827659607, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7706277370452881, + "rewards/length2tails_reward/std": 0.2755311131477356, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1601996421813965, + "rewards/thermo_reward/std": 1.5545653104782104, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09777613170444965, + "epoch": 1.8719999999999999, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18864397704601288, + "learning_rate": 1.4222954174911598e-06, + "loss": -0.0027, + "num_tokens": 8148534.0, + "reward": 12.82504653930664, + "reward_std": 2.758099317550659, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.3964080810546875, + "rewards/kidney_reward/std": 0.9116268754005432, + "rewards/length2tails_reward/mean": 0.8018828630447388, + "rewards/length2tails_reward/std": 0.2824718952178955, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8872647285461426, + "rewards/thermo_reward/std": 1.9564694166183472, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09598909690976143, + "epoch": 1.874, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09967732429504395, + "learning_rate": 1.4211327348686924e-06, + "loss": 0.0021, + "num_tokens": 8157280.0, + "reward": 13.60390853881836, + "reward_std": 0.6416917443275452, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7352484464645386, + "rewards/length2tails_reward/std": 0.2844075858592987, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.470078706741333, + "rewards/thermo_reward/std": 0.5830413699150085, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.0, + "completions/mean_terminated_length": 273.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09456842020154, + "epoch": 1.876, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0885779857635498, + "learning_rate": 1.4199693597992572e-06, + "loss": -0.0056, + "num_tokens": 8166048.0, + "reward": 13.347528457641602, + "reward_std": 1.448620319366455, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7930765151977539, + "rewards/length2tails_reward/std": 0.2606636583805084, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.262633800506592, + "rewards/thermo_reward/std": 1.2503079175949097, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09383744280785322, + "epoch": 1.8780000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12151187658309937, + "learning_rate": 1.4188052941957324e-06, + "loss": 0.0018, + "num_tokens": 8174809.0, + "reward": 13.168033599853516, + "reward_std": 2.0894129276275635, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.478447914123535, + "rewards/kidney_reward/std": 0.6950652003288269, + "rewards/length2tails_reward/mean": 0.8190457820892334, + "rewards/length2tails_reward/std": 0.23952603340148926, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.146496534347534, + "rewards/thermo_reward/std": 1.5423566102981567, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09733179584145546, + "epoch": 1.88, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1220579445362091, + "learning_rate": 1.417640539972131e-06, + "loss": -0.0002, + "num_tokens": 8183532.0, + "reward": 13.365985870361328, + "reward_std": 1.429917812347412, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7476000785827637, + "rewards/length2tails_reward/std": 0.23121827840805054, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.230919599533081, + "rewards/thermo_reward/std": 1.3122392892837524, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.15625, + "completions/mean_terminated_length": 273.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09898821823298931, + "epoch": 1.8820000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1083887368440628, + "learning_rate": 1.416475099043599e-06, + "loss": -0.0014, + "num_tokens": 8192305.0, + "reward": 13.080820083618164, + "reward_std": 2.0238304138183594, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.504392147064209, + "rewards/kidney_reward/std": 0.5529485940933228, + "rewards/length2tails_reward/mean": 0.7721121311187744, + "rewards/length2tails_reward/std": 0.30431506037712097, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0380313396453857, + "rewards/thermo_reward/std": 1.6276273727416992, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08173881471157074, + "epoch": 1.884, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07843513041734695, + "learning_rate": 1.4153089733264114e-06, + "loss": -0.0037, + "num_tokens": 8201037.0, + "reward": 13.109209060668945, + "reward_std": 2.7859926223754883, + "rewards/fitness_reward/mean": 6.999617576599121, + "rewards/fitness_reward/std": 2.045334815979004, + "rewards/kidney_reward/mean": 2.511237621307373, + "rewards/kidney_reward/std": 0.5158844590187073, + "rewards/length2tails_reward/mean": 0.6816088557243347, + "rewards/length2tails_reward/std": 0.3428049087524414, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4301929473876953, + "rewards/thermo_reward/std": 0.6010707020759583, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.65625, + "completions/mean_terminated_length": 271.65625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09550878871232271, + "epoch": 1.8860000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12618957459926605, + "learning_rate": 1.4141421647379681e-06, + "loss": 0.0043, + "num_tokens": 8209762.0, + "reward": 13.667505264282227, + "reward_std": 0.5561919808387756, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.6987535953521729, + "rewards/length2tails_reward/std": 0.3205377459526062, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.50996470451355, + "rewards/thermo_reward/std": 0.5615194439888, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.1875, + "completions/mean_terminated_length": 273.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09802985563874245, + "epoch": 1.888, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11916353553533554, + "learning_rate": 1.4129746751967933e-06, + "loss": 0.0004, + "num_tokens": 8218536.0, + "reward": 13.113667488098145, + "reward_std": 2.162151336669922, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.3640737533569336, + "rewards/kidney_reward/std": 0.8366342782974243, + "rewards/length2tails_reward/mean": 0.7499675154685974, + "rewards/length2tails_reward/std": 0.3374538719654083, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2134127616882324, + "rewards/thermo_reward/std": 1.3592503070831299, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10340733919292688, + "epoch": 1.8900000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08188685774803162, + "learning_rate": 1.4118065066225301e-06, + "loss": -0.0066, + "num_tokens": 8227284.0, + "reward": 13.1350736618042, + "reward_std": 2.407517910003662, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.47782564163208, + "rewards/kidney_reward/std": 0.6984977126121521, + "rewards/length2tails_reward/mean": 0.7967540621757507, + "rewards/length2tails_reward/std": 0.2664756178855896, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1738967895507812, + "rewards/thermo_reward/std": 1.5063883066177368, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.75, + "completions/mean_terminated_length": 273.75, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0962830288335681, + "epoch": 1.892, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08123506605625153, + "learning_rate": 1.410637660935938e-06, + "loss": -0.001, + "num_tokens": 8236076.0, + "reward": 13.664963722229004, + "reward_std": 1.052045464515686, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8287937641143799, + "rewards/length2tails_reward/std": 0.20952561497688293, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.494419574737549, + "rewards/thermo_reward/std": 1.0522606372833252, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.34375, + "completions/mean_terminated_length": 273.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10053523350507021, + "epoch": 1.8940000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15598320960998535, + "learning_rate": 1.4094681400588907e-06, + "loss": 0.0008, + "num_tokens": 8244855.0, + "reward": 13.835688591003418, + "reward_std": 0.4316123425960541, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7851564884185791, + "rewards/length2tails_reward/std": 0.3080328702926636, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10091596748679876, + "epoch": 1.896, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13453346490859985, + "learning_rate": 1.4082979459143704e-06, + "loss": 0.0008, + "num_tokens": 8253588.0, + "reward": 12.257545471191406, + "reward_std": 5.135241508483887, + "rewards/fitness_reward/mean": 6.992251396179199, + "rewards/fitness_reward/std": 2.087003707885742, + "rewards/kidney_reward/mean": 2.2595787048339844, + "rewards/kidney_reward/std": 1.3908474445343018, + "rewards/length2tails_reward/mean": 0.7675119042396545, + "rewards/length2tails_reward/std": 0.26772943139076233, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8289639949798584, + "rewards/thermo_reward/std": 2.066842555999756, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09524937998503447, + "epoch": 1.8980000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07262866199016571, + "learning_rate": 1.407127080426468e-06, + "loss": -0.0065, + "num_tokens": 8262324.0, + "reward": 12.8803071975708, + "reward_std": 2.1984105110168457, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5063095092773438, + "rewards/kidney_reward/std": 0.5425441265106201, + "rewards/length2tails_reward/mean": 0.7491485476493835, + "rewards/length2tails_reward/std": 0.25587987899780273, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.895406723022461, + "rewards/thermo_reward/std": 1.6321258544921875, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0994444964453578, + "epoch": 1.9, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0991891473531723, + "learning_rate": 1.4059555455203776e-06, + "loss": 0.0012, + "num_tokens": 8271084.0, + "reward": 12.7920560836792, + "reward_std": 2.592541456222534, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4269556999206543, + "rewards/kidney_reward/std": 0.6990845203399658, + "rewards/length2tails_reward/mean": 0.7857171893119812, + "rewards/length2tails_reward/std": 0.24670259654521942, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8828530311584473, + "rewards/thermo_reward/std": 1.8071008920669556, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09295944962650537, + "epoch": 1.9020000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07763893902301788, + "learning_rate": 1.4047833431223936e-06, + "loss": -0.007, + "num_tokens": 8279845.0, + "reward": 12.431122779846191, + "reward_std": 4.762660026550293, + "rewards/fitness_reward/mean": 6.984732151031494, + "rewards/fitness_reward/std": 2.1295392513275146, + "rewards/kidney_reward/mean": 2.347259521484375, + "rewards/kidney_reward/std": 1.1302677392959595, + "rewards/length2tails_reward/mean": 0.7435581684112549, + "rewards/length2tails_reward/std": 0.3555532395839691, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9247758388519287, + "rewards/thermo_reward/std": 1.875200629234314, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 273.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.0887742293998599, + "epoch": 1.904, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08556952327489853, + "learning_rate": 1.403610475159909e-06, + "loss": -0.0001, + "num_tokens": 8288625.0, + "reward": 13.757013320922852, + "reward_std": 0.510677695274353, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7961187362670898, + "rewards/length2tails_reward/std": 0.2826400697231293, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09668416064232588, + "epoch": 1.9060000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0863301232457161, + "learning_rate": 1.40243694356141e-06, + "loss": -0.0057, + "num_tokens": 8297376.0, + "reward": 13.427196502685547, + "reward_std": 1.445143461227417, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5376205444335938, + "rewards/kidney_reward/std": 0.5026634931564331, + "rewards/length2tails_reward/mean": 0.733130156993866, + "rewards/length2tails_reward/std": 0.3547823131084442, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3550782203674316, + "rewards/thermo_reward/std": 0.9904274940490723, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09867602214217186, + "epoch": 1.908, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07859351485967636, + "learning_rate": 1.4012627502564742e-06, + "loss": -0.0033, + "num_tokens": 8306133.0, + "reward": 13.1310453414917, + "reward_std": 1.9072624444961548, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.509688138961792, + "rewards/kidney_reward/std": 0.5242533683776855, + "rewards/length2tails_reward/mean": 0.8021127581596375, + "rewards/length2tails_reward/std": 0.24368120729923248, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0799612998962402, + "rewards/thermo_reward/std": 1.5795716047286987, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10116610862314701, + "epoch": 1.9100000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05824463441967964, + "learning_rate": 1.400087897175768e-06, + "loss": -0.0036, + "num_tokens": 8314854.0, + "reward": 13.070198059082031, + "reward_std": 3.001626491546631, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.511730670928955, + "rewards/kidney_reward/std": 0.5132253766059875, + "rewards/length2tails_reward/mean": 0.6872705817222595, + "rewards/length2tails_reward/std": 0.3044886887073517, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.336686611175537, + "rewards/thermo_reward/std": 0.9002618789672852, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08747619763016701, + "epoch": 1.912, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09268809109926224, + "learning_rate": 1.3989123862510416e-06, + "loss": -0.0013, + "num_tokens": 8323592.0, + "reward": 12.777021408081055, + "reward_std": 3.3635833263397217, + "rewards/fitness_reward/mean": 6.941766738891602, + "rewards/fitness_reward/std": 2.06256365776062, + "rewards/kidney_reward/mean": 2.44405460357666, + "rewards/kidney_reward/std": 0.7441923022270203, + "rewards/length2tails_reward/mean": 0.7442810535430908, + "rewards/length2tails_reward/std": 0.29096719622612, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.21677303314209, + "rewards/thermo_reward/std": 1.5317223072052002, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10323672275990248, + "epoch": 1.9140000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09083746373653412, + "learning_rate": 1.3977362194151278e-06, + "loss": 0.0029, + "num_tokens": 8332335.0, + "reward": 13.35675048828125, + "reward_std": 1.421925663948059, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7612141370773315, + "rewards/length2tails_reward/std": 0.26568615436553955, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2750422954559326, + "rewards/thermo_reward/std": 1.289695382118225, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.71875, + "completions/mean_terminated_length": 272.71875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.10143384896218777, + "epoch": 1.916, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07080568373203278, + "learning_rate": 1.3965593986019372e-06, + "loss": -0.0015, + "num_tokens": 8341094.0, + "reward": 13.076738357543945, + "reward_std": 2.0742430686950684, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7840505838394165, + "rewards/length2tails_reward/std": 0.2203812152147293, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9927470684051514, + "rewards/thermo_reward/std": 1.8821724653244019, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 273.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09049487207084894, + "epoch": 1.9180000000000001, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08178161829710007, + "learning_rate": 1.3953819257464558e-06, + "loss": -0.0024, + "num_tokens": 8349874.0, + "reward": 12.805414199829102, + "reward_std": 2.533339738845825, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4211130142211914, + "rewards/kidney_reward/std": 0.5994554758071899, + "rewards/length2tails_reward/mean": 0.7625944018363953, + "rewards/length2tails_reward/std": 0.31074151396751404, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9043655395507812, + "rewards/thermo_reward/std": 1.8625069856643677, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.8125, + "completions/mean_terminated_length": 273.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09283266495913267, + "epoch": 1.92, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06749741733074188, + "learning_rate": 1.3942038027847423e-06, + "loss": -0.005, + "num_tokens": 8358668.0, + "reward": 12.34156322479248, + "reward_std": 5.33221435546875, + "rewards/fitness_reward/mean": 6.687288284301758, + "rewards/fitness_reward/std": 2.662079334259033, + "rewards/kidney_reward/mean": 2.3554515838623047, + "rewards/kidney_reward/std": 1.135952353477478, + "rewards/length2tails_reward/mean": 0.797242283821106, + "rewards/length2tails_reward/std": 0.2937704026699066, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1190998554229736, + "rewards/thermo_reward/std": 1.6363537311553955, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09066375996917486, + "epoch": 1.9220000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12556754052639008, + "learning_rate": 1.3930250316539235e-06, + "loss": -0.0062, + "num_tokens": 8367389.0, + "reward": 12.197015762329102, + "reward_std": 4.578823566436768, + "rewards/fitness_reward/mean": 6.927325248718262, + "rewards/fitness_reward/std": 1.897562026977539, + "rewards/kidney_reward/mean": 2.257803201675415, + "rewards/kidney_reward/std": 1.0970770120620728, + "rewards/length2tails_reward/mean": 0.6851399540901184, + "rewards/length2tails_reward/std": 0.34955552220344543, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8433737754821777, + "rewards/thermo_reward/std": 2.145561695098877, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09863616060465574, + "epoch": 1.924, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10057486593723297, + "learning_rate": 1.3918456142921925e-06, + "loss": 0.0015, + "num_tokens": 8376154.0, + "reward": 13.623000144958496, + "reward_std": 0.683121919631958, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8008888959884644, + "rewards/length2tails_reward/std": 0.2569819390773773, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.50996470451355, + "rewards/thermo_reward/std": 0.5615194439888, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.0, + "completions/mean_terminated_length": 273.0, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10349648538976908, + "epoch": 1.9260000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10809936374425888, + "learning_rate": 1.390665552638805e-06, + "loss": -0.0031, + "num_tokens": 8384922.0, + "reward": 13.108209609985352, + "reward_std": 2.479018211364746, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.446418523788452, + "rewards/kidney_reward/std": 0.8726393580436707, + "rewards/length2tails_reward/mean": 0.8275328874588013, + "rewards/length2tails_reward/std": 0.23122048377990723, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1178531646728516, + "rewards/thermo_reward/std": 1.775908350944519, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.09821292012929916, + "epoch": 1.928, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2729121446609497, + "learning_rate": 1.3894848486340754e-06, + "loss": 0.0046, + "num_tokens": 8393620.0, + "reward": 13.802491188049316, + "reward_std": 0.5193327069282532, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7267663478851318, + "rewards/length2tails_reward/std": 0.33441072702407837, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09538561757653952, + "epoch": 1.9300000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08171401172876358, + "learning_rate": 1.388303504219375e-06, + "loss": -0.0053, + "num_tokens": 8402374.0, + "reward": 13.652557373046875, + "reward_std": 0.9250484108924866, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7945376634597778, + "rewards/length2tails_reward/std": 0.27631810307502747, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4854390621185303, + "rewards/thermo_reward/std": 0.9097126126289368, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10812081210315228, + "epoch": 1.932, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11137594282627106, + "learning_rate": 1.387121521337128e-06, + "loss": 0.0014, + "num_tokens": 8411129.0, + "reward": 13.556892395019531, + "reward_std": 0.6138424277305603, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.789208710193634, + "rewards/length2tails_reward/std": 0.28543025255203247, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3903069496154785, + "rewards/thermo_reward/std": 0.6159141063690186, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09557634359225631, + "epoch": 1.9340000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05619344115257263, + "learning_rate": 1.3859389019308082e-06, + "loss": -0.0061, + "num_tokens": 8419857.0, + "reward": 13.010406494140625, + "reward_std": 3.1404671669006348, + "rewards/fitness_reward/mean": 6.938035011291504, + "rewards/fitness_reward/std": 1.840762972831726, + "rewards/kidney_reward/mean": 2.4843714237213135, + "rewards/kidney_reward/std": 0.5299732685089111, + "rewards/length2tails_reward/mean": 0.7154085636138916, + "rewards/length2tails_reward/std": 0.319457471370697, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4164581298828125, + "rewards/thermo_reward/std": 0.8831133842468262, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09051778819411993, + "epoch": 1.936, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13804982602596283, + "learning_rate": 1.384755647944936e-06, + "loss": -0.005, + "num_tokens": 8428607.0, + "reward": 12.845332145690918, + "reward_std": 3.591193199157715, + "rewards/fitness_reward/mean": 7.020461559295654, + "rewards/fitness_reward/std": 1.9274237155914307, + "rewards/kidney_reward/mean": 2.483234405517578, + "rewards/kidney_reward/std": 0.5358594655990601, + "rewards/length2tails_reward/mean": 0.7685626745223999, + "rewards/length2tails_reward/std": 0.31856513023376465, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1647799015045166, + "rewards/thermo_reward/std": 1.5537383556365967, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.5, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.10311949905008078, + "epoch": 1.938, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1776103377342224, + "learning_rate": 1.3835717613250753e-06, + "loss": -0.0019, + "num_tokens": 8437391.0, + "reward": 13.27910041809082, + "reward_std": 2.4083566665649414, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.422438621520996, + "rewards/kidney_reward/std": 0.8103145360946655, + "rewards/length2tails_reward/mean": 0.8479813933372498, + "rewards/length2tails_reward/std": 0.2027747631072998, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3681869506835938, + "rewards/thermo_reward/std": 1.3651937246322632, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09956981893628836, + "epoch": 1.94, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08768046647310257, + "learning_rate": 1.38238724401783e-06, + "loss": -0.0042, + "num_tokens": 8446137.0, + "reward": 12.956103324890137, + "reward_std": 3.2299368381500244, + "rewards/fitness_reward/mean": 6.987481117248535, + "rewards/fitness_reward/std": 2.113990306854248, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7534193396568298, + "rewards/length2tails_reward/std": 0.2735549807548523, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2215192317962646, + "rewards/thermo_reward/std": 1.3512930870056152, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.03125, + "completions/mean_terminated_length": 273.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10589835606515408, + "epoch": 1.942, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10970190912485123, + "learning_rate": 1.3812020979708417e-06, + "loss": -0.0011, + "num_tokens": 8454906.0, + "reward": 12.781929969787598, + "reward_std": 2.1080613136291504, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4233293533325195, + "rewards/kidney_reward/std": 0.5885559916496277, + "rewards/length2tails_reward/mean": 0.7808050513267517, + "rewards/length2tails_reward/std": 0.2904893755912781, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8193349838256836, + "rewards/thermo_reward/std": 1.6535685062408447, + "step": 971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09761006757616997, + "epoch": 1.944, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1761743128299713, + "learning_rate": 1.3800163251327849e-06, + "loss": -0.0004, + "num_tokens": 8463648.0, + "reward": 12.051850318908691, + "reward_std": 5.621099472045898, + "rewards/fitness_reward/mean": 6.686112403869629, + "rewards/fitness_reward/std": 2.667112112045288, + "rewards/kidney_reward/mean": 2.3069543838500977, + "rewards/kidney_reward/std": 1.2476999759674072, + "rewards/length2tails_reward/mean": 0.7506591081619263, + "rewards/length2tails_reward/std": 0.3087124824523926, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8837180137634277, + "rewards/thermo_reward/std": 2.179638147354126, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.09852071199566126, + "epoch": 1.946, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07111741602420807, + "learning_rate": 1.3788299274533647e-06, + "loss": -0.001, + "num_tokens": 8472420.0, + "reward": 13.54722785949707, + "reward_std": 0.9824661612510681, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8295840620994568, + "rewards/length2tails_reward/std": 0.19819991290569305, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4039645195007324, + "rewards/thermo_reward/std": 0.9403955340385437, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 0.09349171537905931, + "epoch": 1.948, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46533671021461487, + "learning_rate": 1.377642906883315e-06, + "loss": -0.013, + "num_tokens": 8481116.0, + "reward": 13.117490768432617, + "reward_std": 1.789286494255066, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.717496931552887, + "rewards/length2tails_reward/std": 0.3229467272758484, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.100454807281494, + "rewards/thermo_reward/std": 1.4970639944076538, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09781383909285069, + "epoch": 1.95, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05911961942911148, + "learning_rate": 1.3764552653743919e-06, + "loss": -0.0044, + "num_tokens": 8489871.0, + "reward": 13.45779800415039, + "reward_std": 1.3013496398925781, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7833488583564758, + "rewards/length2tails_reward/std": 0.2513625919818878, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.346517562866211, + "rewards/thermo_reward/std": 1.1603977680206299, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09864049591124058, + "epoch": 1.952, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07618109881877899, + "learning_rate": 1.3752670048793743e-06, + "loss": -0.0041, + "num_tokens": 8498613.0, + "reward": 12.933008193969727, + "reward_std": 2.295651435852051, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.517043113708496, + "rewards/kidney_reward/std": 0.2941751182079315, + "rewards/length2tails_reward/mean": 0.7475055456161499, + "rewards/length2tails_reward/std": 0.30367472767829895, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8800292015075684, + "rewards/thermo_reward/std": 1.998351812362671, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10363372974097729, + "epoch": 1.954, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07959149032831192, + "learning_rate": 1.3740781273520572e-06, + "loss": -0.006, + "num_tokens": 8507374.0, + "reward": 12.797253608703613, + "reward_std": 2.882831335067749, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3329405784606934, + "rewards/kidney_reward/std": 0.9683598279953003, + "rewards/length2tails_reward/mean": 0.7689581513404846, + "rewards/length2tails_reward/std": 0.27883997559547424, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.983741283416748, + "rewards/thermo_reward/std": 1.7046470642089844, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09072333294898272, + "epoch": 1.956, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09165831655263901, + "learning_rate": 1.3728886347472515e-06, + "loss": 0.006, + "num_tokens": 8516110.0, + "reward": 13.664487838745117, + "reward_std": 0.9979863166809082, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7295340895652771, + "rewards/length2tails_reward/std": 0.2998608946800232, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.531228542327881, + "rewards/thermo_reward/std": 0.8661776781082153, + "step": 978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09883951116353273, + "epoch": 1.958, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09821531176567078, + "learning_rate": 1.3716985290207786e-06, + "loss": -0.0055, + "num_tokens": 8524818.0, + "reward": 13.384990692138672, + "reward_std": 1.777211308479309, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7037378549575806, + "rewards/length2tails_reward/std": 0.26936420798301697, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2816712856292725, + "rewards/thermo_reward/std": 1.5554665327072144, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09577017091214657, + "epoch": 1.96, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10303295403718948, + "learning_rate": 1.3705078121294688e-06, + "loss": -0.0015, + "num_tokens": 8533576.0, + "reward": 13.507298469543457, + "reward_std": 1.4854562282562256, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8118197917938232, + "rewards/length2tails_reward/std": 0.20147258043289185, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3931705951690674, + "rewards/thermo_reward/std": 1.2787867784500122, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10504948906600475, + "epoch": 1.962, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12444677203893661, + "learning_rate": 1.3693164860311562e-06, + "loss": 0.0027, + "num_tokens": 8542313.0, + "reward": 13.287333488464355, + "reward_std": 1.4708105325698853, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7800248861312866, + "rewards/length2tails_reward/std": 0.22505053877830505, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.176384925842285, + "rewards/thermo_reward/std": 1.3494049310684204, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.0, + "completions/mean_terminated_length": 273.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11124454904347658, + "epoch": 1.964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10324909538030624, + "learning_rate": 1.3681245526846781e-06, + "loss": -0.0022, + "num_tokens": 8551081.0, + "reward": 13.270578384399414, + "reward_std": 1.4656739234924316, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.517043113708496, + "rewards/kidney_reward/std": 0.2941751182079315, + "rewards/length2tails_reward/mean": 0.800129771232605, + "rewards/length2tails_reward/std": 0.2642292380332947, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.212337017059326, + "rewards/thermo_reward/std": 1.2223610877990723, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.1875, + "completions/mean_terminated_length": 273.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09352367091923952, + "epoch": 1.966, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0937211737036705, + "learning_rate": 1.3669320140498683e-06, + "loss": -0.0034, + "num_tokens": 8559855.0, + "reward": 13.362506866455078, + "reward_std": 1.3659077882766724, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7871782779693604, + "rewards/length2tails_reward/std": 0.2870863676071167, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.250842571258545, + "rewards/thermo_reward/std": 1.3097805976867676, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.15625, + "completions/mean_terminated_length": 273.15625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09805367235094309, + "epoch": 1.968, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07226665318012238, + "learning_rate": 1.3657388720875579e-06, + "loss": -0.0034, + "num_tokens": 8568628.0, + "reward": 13.506750106811523, + "reward_std": 1.1540277004241943, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7862600088119507, + "rewards/length2tails_reward/std": 0.2653719186782837, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3951773643493652, + "rewards/thermo_reward/std": 0.9817285537719727, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 274.25, + "completions/mean_terminated_length": 274.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10598085541278124, + "epoch": 1.97, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07709791511297226, + "learning_rate": 1.3645451287595686e-06, + "loss": -0.0035, + "num_tokens": 8577436.0, + "reward": 13.531421661376953, + "reward_std": 1.180441975593567, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8664608597755432, + "rewards/length2tails_reward/std": 0.21417482197284698, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3571105003356934, + "rewards/thermo_reward/std": 1.1683533191680908, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 274.03125, + "completions/mean_terminated_length": 274.03125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0898081511259079, + "epoch": 1.972, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07521162182092667, + "learning_rate": 1.3633507860287114e-06, + "loss": -0.0047, + "num_tokens": 8586237.0, + "reward": 13.374664306640625, + "reward_std": 1.6267354488372803, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8432106971740723, + "rewards/length2tails_reward/std": 0.21972203254699707, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.257397413253784, + "rewards/thermo_reward/std": 1.4419727325439453, + "step": 986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09460345283150673, + "epoch": 1.974, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10553532838821411, + "learning_rate": 1.3621558458587833e-06, + "loss": -0.0049, + "num_tokens": 8594977.0, + "reward": 12.886335372924805, + "reward_std": 4.894617557525635, + "rewards/fitness_reward/mean": 6.972713470458984, + "rewards/fitness_reward/std": 2.197526216506958, + "rewards/kidney_reward/mean": 2.406907796859741, + "rewards/kidney_reward/std": 1.242085576057434, + "rewards/length2tails_reward/mean": 0.7495771050453186, + "rewards/length2tails_reward/std": 0.2758795917034149, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.331756591796875, + "rewards/thermo_reward/std": 1.5028119087219238, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09682532027363777, + "epoch": 1.976, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13430586457252502, + "learning_rate": 1.3609603102145623e-06, + "loss": -0.0037, + "num_tokens": 8603704.0, + "reward": 12.990877151489258, + "reward_std": 3.0743181705474854, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.5390896797180176, + "rewards/kidney_reward/std": 0.49435171484947205, + "rewards/length2tails_reward/mean": 0.7251232862472534, + "rewards/length2tails_reward/std": 0.30888208746910095, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2262210845947266, + "rewards/thermo_reward/std": 1.1746268272399902, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.21875, + "completions/mean_terminated_length": 273.21875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10247461404651403, + "epoch": 1.978, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1997416615486145, + "learning_rate": 1.359764181061807e-06, + "loss": -0.0028, + "num_tokens": 8612479.0, + "reward": 12.312665939331055, + "reward_std": 3.448193311691284, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.452592670917511, + "rewards/kidney_reward/mean": 2.255847454071045, + "rewards/kidney_reward/std": 1.216636300086975, + "rewards/length2tails_reward/mean": 0.788469672203064, + "rewards/length2tails_reward/std": 0.3180660903453827, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.631805419921875, + "rewards/thermo_reward/std": 2.0575854778289795, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09506144933402538, + "epoch": 1.98, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11303427815437317, + "learning_rate": 1.3585674603672507e-06, + "loss": 0.0027, + "num_tokens": 8621219.0, + "reward": 13.591766357421875, + "reward_std": 1.0942988395690918, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7696424722671509, + "rewards/length2tails_reward/std": 0.2485266625881195, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4818553924560547, + "rewards/thermo_reward/std": 0.9271373748779297, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09461696818470955, + "epoch": 1.982, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13793237507343292, + "learning_rate": 1.357370150098601e-06, + "loss": -0.0026, + "num_tokens": 8629940.0, + "reward": 12.687845230102539, + "reward_std": 4.5070953369140625, + "rewards/fitness_reward/mean": 7.020533561706543, + "rewards/fitness_reward/std": 1.927014708518982, + "rewards/kidney_reward/mean": 2.4172534942626953, + "rewards/kidney_reward/std": 1.0354214906692505, + "rewards/length2tails_reward/mean": 0.7397796511650085, + "rewards/length2tails_reward/std": 0.26660993695259094, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.076079845428467, + "rewards/thermo_reward/std": 2.0029871463775635, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.10359423235058784, + "epoch": 1.984, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1719888299703598, + "learning_rate": 1.3561722522245325e-06, + "loss": 0.0036, + "num_tokens": 8638684.0, + "reward": 13.72146987915039, + "reward_std": 0.5342023968696594, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8395423889160156, + "rewards/length2tails_reward/std": 0.17699043452739716, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498507022857666, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.1875, + "completions/mean_terminated_length": 273.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08843657933175564, + "epoch": 1.986, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16318239271640778, + "learning_rate": 1.3549737687146882e-06, + "loss": 0.0034, + "num_tokens": 8647458.0, + "reward": 13.68990707397461, + "reward_std": 0.6028597950935364, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7975119352340698, + "rewards/length2tails_reward/std": 0.2890626788139343, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498507022857666, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.10393639095127583, + "epoch": 1.988, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06861487030982971, + "learning_rate": 1.3537747015396723e-06, + "loss": 0.0001, + "num_tokens": 8656154.0, + "reward": 12.781076431274414, + "reward_std": 3.3552956581115723, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.511730670928955, + "rewards/kidney_reward/std": 0.5132253766059875, + "rewards/length2tails_reward/mean": 0.7422202825546265, + "rewards/length2tails_reward/std": 0.2663733661174774, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0420706272125244, + "rewards/thermo_reward/std": 1.7657049894332886, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09462670609354973, + "epoch": 1.99, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13636404275894165, + "learning_rate": 1.3525750526710499e-06, + "loss": -0.001, + "num_tokens": 8664910.0, + "reward": 12.781831741333008, + "reward_std": 4.220463275909424, + "rewards/fitness_reward/mean": 7.030411243438721, + "rewards/fitness_reward/std": 1.871139645576477, + "rewards/kidney_reward/mean": 2.3933980464935303, + "rewards/kidney_reward/std": 1.021881341934204, + "rewards/length2tails_reward/mean": 0.7944914102554321, + "rewards/length2tails_reward/std": 0.25824275612831116, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1785738468170166, + "rewards/thermo_reward/std": 1.5321305990219116, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.0, + "completions/max_terminated_length": 667.0, + "completions/mean_length": 284.4375, + "completions/mean_terminated_length": 284.4375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.1007428178563714, + "epoch": 1.992, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5512227416038513, + "learning_rate": 1.3513748240813427e-06, + "loss": -0.0161, + "num_tokens": 8674044.0, + "reward": 13.84260368347168, + "reward_std": 0.4883881211280823, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7290430068969727, + "rewards/length2tails_reward/std": 0.30068859457969666, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09722385462373495, + "epoch": 1.994, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07077227532863617, + "learning_rate": 1.350174017744024e-06, + "loss": -0.0004, + "num_tokens": 8682772.0, + "reward": 12.302651405334473, + "reward_std": 5.062031269073486, + "rewards/fitness_reward/mean": 7.00186824798584, + "rewards/fitness_reward/std": 2.0326035022735596, + "rewards/kidney_reward/mean": 2.200948476791382, + "rewards/kidney_reward/std": 1.4013389348983765, + "rewards/length2tails_reward/mean": 0.760050356388092, + "rewards/length2tails_reward/std": 0.2809792160987854, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9238295555114746, + "rewards/thermo_reward/std": 2.1210215091705322, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09150166623294353, + "epoch": 1.996, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07145685702562332, + "learning_rate": 1.3489726356335189e-06, + "loss": 0.0029, + "num_tokens": 8691530.0, + "reward": 13.876331329345703, + "reward_std": 0.37368762493133545, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7927263975143433, + "rewards/length2tails_reward/std": 0.25554361939430237, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0959260631352663, + "epoch": 1.998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1562761813402176, + "learning_rate": 1.3477706797251984e-06, + "loss": -0.003, + "num_tokens": 8700216.0, + "reward": 12.899539947509766, + "reward_std": 2.173999786376953, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.3832297325134277, + "rewards/kidney_reward/std": 0.7602803111076355, + "rewards/length2tails_reward/mean": 0.6400865912437439, + "rewards/length2tails_reward/std": 0.33226698637008667, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9911162853240967, + "rewards/thermo_reward/std": 1.7141780853271484, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.09887748025357723, + "epoch": 2.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08962114155292511, + "learning_rate": 1.3465681519953763e-06, + "loss": 0.0021, + "num_tokens": 8708972.0, + "reward": 13.58544921875, + "reward_std": 0.9744566082954407, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8294492959976196, + "rewards/length2tails_reward/std": 0.20147967338562012, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4421987533569336, + "rewards/thermo_reward/std": 0.9380018711090088, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.03125, + "completions/mean_terminated_length": 273.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09856892470270395, + "epoch": 2.002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1271350234746933, + "learning_rate": 1.3453650544213076e-06, + "loss": -0.0074, + "num_tokens": 8717741.0, + "reward": 13.097925186157227, + "reward_std": 2.1939687728881836, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.4370408058166504, + "rewards/kidney_reward/std": 0.6464345455169678, + "rewards/length2tails_reward/mean": 0.7915678024291992, + "rewards/length2tails_reward/std": 0.2875259518623352, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2355613708496094, + "rewards/thermo_reward/std": 1.5196568965911865, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.5, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09953618701547384, + "epoch": 2.004, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11234353482723236, + "learning_rate": 1.3441613889811842e-06, + "loss": 0.0046, + "num_tokens": 8726525.0, + "reward": 13.733948707580566, + "reward_std": 0.576275110244751, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8390617370605469, + "rewards/length2tails_reward/std": 0.21928992867469788, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10064785089343786, + "epoch": 2.006, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1869335174560547, + "learning_rate": 1.3429571576541314e-06, + "loss": 0.0006, + "num_tokens": 8735279.0, + "reward": 13.310781478881836, + "reward_std": 2.0318822860717773, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.448695182800293, + "rewards/kidney_reward/std": 0.7191016674041748, + "rewards/length2tails_reward/mean": 0.7878056764602661, + "rewards/length2tails_reward/std": 0.27009204030036926, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3796305656433105, + "rewards/thermo_reward/std": 1.0566301345825195, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08979780692607164, + "epoch": 2.008, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05455811321735382, + "learning_rate": 1.3417523624202052e-06, + "loss": -0.0046, + "num_tokens": 8744016.0, + "reward": 13.631386756896973, + "reward_std": 1.6224788427352905, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7611433267593384, + "rewards/length2tails_reward/std": 0.2798122763633728, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.494967460632324, + "rewards/thermo_reward/std": 1.4581682682037354, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.1012858347967267, + "epoch": 2.01, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11689814925193787, + "learning_rate": 1.3405470052603882e-06, + "loss": -0.0019, + "num_tokens": 8752777.0, + "reward": 12.523555755615234, + "reward_std": 4.099765300750732, + "rewards/fitness_reward/mean": 7.006715774536133, + "rewards/fitness_reward/std": 2.0051794052124023, + "rewards/kidney_reward/mean": 2.3300538063049316, + "rewards/kidney_reward/std": 0.943598747253418, + "rewards/length2tails_reward/mean": 0.8091681003570557, + "rewards/length2tails_reward/std": 0.2313581109046936, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0058703422546387, + "rewards/thermo_reward/std": 1.6812902688980103, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09508247300982475, + "epoch": 2.012, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11960920691490173, + "learning_rate": 1.3393410881565876e-06, + "loss": -0.0001, + "num_tokens": 8761543.0, + "reward": 12.627235412597656, + "reward_std": 4.367265701293945, + "rewards/fitness_reward/mean": 7.018800258636475, + "rewards/fitness_reward/std": 1.936821460723877, + "rewards/kidney_reward/mean": 2.4195845127105713, + "rewards/kidney_reward/std": 1.0223861932754517, + "rewards/length2tails_reward/mean": 0.8042271137237549, + "rewards/length2tails_reward/std": 0.22055809199810028, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0084283351898193, + "rewards/thermo_reward/std": 1.5836304426193237, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09306901833042502, + "epoch": 2.014, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.084152452647686, + "learning_rate": 1.3381346130916314e-06, + "loss": -0.006, + "num_tokens": 8770252.0, + "reward": 13.134542465209961, + "reward_std": 3.140427350997925, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.511730670928955, + "rewards/kidney_reward/std": 0.5132253766059875, + "rewards/length2tails_reward/mean": 0.695670485496521, + "rewards/length2tails_reward/std": 0.32462623715400696, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.400190591812134, + "rewards/thermo_reward/std": 1.2556166648864746, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09930686559528112, + "epoch": 2.016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10044628381729126, + "learning_rate": 1.336927582049264e-06, + "loss": 0.0009, + "num_tokens": 8778998.0, + "reward": 13.425021171569824, + "reward_std": 1.7134307622909546, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7415978908538818, + "rewards/length2tails_reward/std": 0.33030208945274353, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.345273494720459, + "rewards/thermo_reward/std": 1.483225703239441, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08971796184778214, + "epoch": 2.018, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06464128941297531, + "learning_rate": 1.3357199970141454e-06, + "loss": -0.0054, + "num_tokens": 8787745.0, + "reward": 13.278300285339355, + "reward_std": 2.359668731689453, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4579098224639893, + "rewards/kidney_reward/std": 0.808746337890625, + "rewards/length2tails_reward/mean": 0.7865937948226929, + "rewards/length2tails_reward/std": 0.2422349900007248, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3380556106567383, + "rewards/thermo_reward/std": 1.2652010917663574, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.5, + "completions/mean_terminated_length": 272.5, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09284854214638472, + "epoch": 2.02, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07684195041656494, + "learning_rate": 1.3345118599718454e-06, + "loss": -0.0006, + "num_tokens": 8796497.0, + "reward": 13.54580307006836, + "reward_std": 1.1847339868545532, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8016017079353333, + "rewards/length2tails_reward/std": 0.1992776095867157, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3779773712158203, + "rewards/thermo_reward/std": 1.1855145692825317, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09131715074181557, + "epoch": 2.022, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11919866502285004, + "learning_rate": 1.3333031729088417e-06, + "loss": -0.0051, + "num_tokens": 8805232.0, + "reward": 13.67043399810791, + "reward_std": 0.5742598176002502, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7280459403991699, + "rewards/length2tails_reward/std": 0.3318087160587311, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.50996470451355, + "rewards/thermo_reward/std": 0.5615194439888, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.10009663831442595, + "epoch": 2.024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08270273357629776, + "learning_rate": 1.3320939378125168e-06, + "loss": -0.0059, + "num_tokens": 8813983.0, + "reward": 13.769775390625, + "reward_std": 0.5637372136116028, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7984813451766968, + "rewards/length2tails_reward/std": 0.25650182366371155, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09755912981927395, + "epoch": 2.026, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09963485598564148, + "learning_rate": 1.3308841566711537e-06, + "loss": -0.0049, + "num_tokens": 8822753.0, + "reward": 13.061192512512207, + "reward_std": 2.7797274589538574, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3953945636749268, + "rewards/kidney_reward/std": 0.9332201480865479, + "rewards/length2tails_reward/mean": 0.8119730949401855, + "rewards/length2tails_reward/std": 0.2405090183019638, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.180924892425537, + "rewards/thermo_reward/std": 1.625937581062317, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.10167609713971615, + "epoch": 2.028, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1964738517999649, + "learning_rate": 1.3296738314739338e-06, + "loss": -0.0037, + "num_tokens": 8831497.0, + "reward": 12.929141998291016, + "reward_std": 2.913825035095215, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3505191802978516, + "rewards/kidney_reward/std": 1.1123260259628296, + "rewards/length2tails_reward/mean": 0.7692917585372925, + "rewards/length2tails_reward/std": 0.2531833350658417, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0980172157287598, + "rewards/thermo_reward/std": 1.6565762758255005, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09269763343036175, + "epoch": 2.03, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09346351772546768, + "learning_rate": 1.3284629642109324e-06, + "loss": 0.0007, + "num_tokens": 8840245.0, + "reward": 12.098461151123047, + "reward_std": 4.814784049987793, + "rewards/fitness_reward/mean": 7.00302791595459, + "rewards/fitness_reward/std": 2.0260438919067383, + "rewards/kidney_reward/mean": 2.2101261615753174, + "rewards/kidney_reward/std": 1.2059481143951416, + "rewards/length2tails_reward/mean": 0.7802326679229736, + "rewards/length2tails_reward/std": 0.24878345429897308, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.7072839736938477, + "rewards/thermo_reward/std": 2.170975923538208, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09186808485537767, + "epoch": 2.032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16771812736988068, + "learning_rate": 1.327251556873117e-06, + "loss": -0.0048, + "num_tokens": 8848980.0, + "reward": 12.41139030456543, + "reward_std": 4.630665302276611, + "rewards/fitness_reward/mean": 6.72658634185791, + "rewards/fitness_reward/std": 2.49751615524292, + "rewards/kidney_reward/mean": 2.326782464981079, + "rewards/kidney_reward/std": 1.0067613124847412, + "rewards/length2tails_reward/mean": 0.7035547494888306, + "rewards/length2tails_reward/std": 0.3279150128364563, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1876654624938965, + "rewards/thermo_reward/std": 1.489618182182312, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 0.10063336603343487, + "epoch": 2.034, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11937062442302704, + "learning_rate": 1.3260396114523417e-06, + "loss": 0.0058, + "num_tokens": 8857695.0, + "reward": 13.733278274536133, + "reward_std": 0.578973114490509, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8323647975921631, + "rewards/length2tails_reward/std": 0.20198071002960205, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.21875, + "completions/mean_terminated_length": 273.21875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09128071740269661, + "epoch": 2.036, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16607420146465302, + "learning_rate": 1.3248271299413474e-06, + "loss": 0.0021, + "num_tokens": 8866470.0, + "reward": 13.388148307800293, + "reward_std": 1.5452548265457153, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.8022655248641968, + "rewards/length2tails_reward/std": 0.25334814190864563, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3023343086242676, + "rewards/thermo_reward/std": 1.3248815536499023, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10503023117780685, + "epoch": 2.038, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0910627543926239, + "learning_rate": 1.323614114333754e-06, + "loss": 0.0009, + "num_tokens": 8875230.0, + "reward": 13.491312980651855, + "reward_std": 0.6788957118988037, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8058663010597229, + "rewards/length2tails_reward/std": 0.21435952186584473, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3504209518432617, + "rewards/thermo_reward/std": 0.6277978420257568, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09710283949971199, + "epoch": 2.04, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10189030319452286, + "learning_rate": 1.3224005666240623e-06, + "loss": -0.0023, + "num_tokens": 8883984.0, + "reward": 13.674921035766602, + "reward_std": 0.5684311985969543, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.772911787033081, + "rewards/length2tails_reward/std": 0.2503526210784912, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.50996470451355, + "rewards/thermo_reward/std": 0.5615194439888, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 274.0, + "completions/mean_terminated_length": 274.0, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10056890733540058, + "epoch": 2.042, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.121220663189888, + "learning_rate": 1.3211864888076456e-06, + "loss": -0.0001, + "num_tokens": 8892784.0, + "reward": 13.64167594909668, + "reward_std": 0.5900519490242004, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8393223285675049, + "rewards/length2tails_reward/std": 0.24542348086833954, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.470078706741333, + "rewards/thermo_reward/std": 0.5830413699150085, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09844275377690792, + "epoch": 2.044, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09364219754934311, + "learning_rate": 1.31997188288075e-06, + "loss": -0.0015, + "num_tokens": 8901539.0, + "reward": 13.416436195373535, + "reward_std": 1.4693506956100464, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8285934925079346, + "rewards/length2tails_reward/std": 0.2166094034910202, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.300631046295166, + "rewards/thermo_reward/std": 1.3310917615890503, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09990180004388094, + "epoch": 2.046, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07670339941978455, + "learning_rate": 1.3187567508404898e-06, + "loss": -0.0037, + "num_tokens": 8910273.0, + "reward": 13.480457305908203, + "reward_std": 1.139972448348999, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.761543333530426, + "rewards/length2tails_reward/std": 0.2405269891023636, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.316638231277466, + "rewards/thermo_reward/std": 1.1297534704208374, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09408196154981852, + "epoch": 2.048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0961371660232544, + "learning_rate": 1.3175410946848444e-06, + "loss": 0.0005, + "num_tokens": 8919028.0, + "reward": 13.795574188232422, + "reward_std": 0.46939560770988464, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7828723192214966, + "rewards/length2tails_reward/std": 0.2605209946632385, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08972968813031912, + "epoch": 2.05, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10491012036800385, + "learning_rate": 1.3163249164126547e-06, + "loss": -0.0046, + "num_tokens": 8927763.0, + "reward": 13.144769668579102, + "reward_std": 2.383373975753784, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.466433048248291, + "rewards/kidney_reward/std": 0.7614781260490417, + "rewards/length2tails_reward/mean": 0.7181075811386108, + "rewards/length2tails_reward/std": 0.3215291500091553, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2028496265411377, + "rewards/thermo_reward/std": 1.4257335662841797, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.09596592746675014, + "epoch": 2.052, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31508520245552063, + "learning_rate": 1.3151082180236209e-06, + "loss": -0.0119, + "num_tokens": 8936459.0, + "reward": 13.793975830078125, + "reward_std": 0.46918433904647827, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7668845653533936, + "rewards/length2tails_reward/std": 0.2270083874464035, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08913787268102169, + "epoch": 2.054, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07905755937099457, + "learning_rate": 1.3138910015182968e-06, + "loss": -0.0013, + "num_tokens": 8945207.0, + "reward": 12.620532989501953, + "reward_std": 3.6539318561553955, + "rewards/fitness_reward/mean": 7.188657283782959, + "rewards/fitness_reward/std": 0.7179933190345764, + "rewards/kidney_reward/mean": 2.3732101917266846, + "rewards/kidney_reward/std": 1.041853427886963, + "rewards/length2tails_reward/mean": 0.767655611038208, + "rewards/length2tails_reward/std": 0.29074984788894653, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.8819003105163574, + "rewards/thermo_reward/std": 2.097299575805664, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10382816102355719, + "epoch": 2.056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08299266546964645, + "learning_rate": 1.31267326889809e-06, + "loss": -0.0031, + "num_tokens": 8953940.0, + "reward": 13.17679214477539, + "reward_std": 1.736342430114746, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5099523067474365, + "rewards/kidney_reward/std": 0.5228261947631836, + "rewards/length2tails_reward/mean": 0.7341771125793457, + "rewards/length2tails_reward/std": 0.27263689041137695, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.132237434387207, + "rewards/thermo_reward/std": 1.3638969659805298, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09531169943511486, + "epoch": 2.058, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08293966203927994, + "learning_rate": 1.3114550221652552e-06, + "loss": -0.001, + "num_tokens": 8962697.0, + "reward": 13.612466812133789, + "reward_std": 1.2178254127502441, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7928884029388428, + "rewards/length2tails_reward/std": 0.22859860956668854, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.472871780395508, + "rewards/thermo_reward/std": 1.1119534969329834, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.53125, + "completions/mean_terminated_length": 273.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09332179836928844, + "epoch": 2.06, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1411670595407486, + "learning_rate": 1.3102362633228933e-06, + "loss": 0.0021, + "num_tokens": 8971482.0, + "reward": 13.582618713378906, + "reward_std": 1.0881257057189941, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8324671387672424, + "rewards/length2tails_reward/std": 0.23901960253715515, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4390664100646973, + "rewards/thermo_reward/std": 0.952990710735321, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08503109263256192, + "epoch": 2.062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1336006224155426, + "learning_rate": 1.3090169943749473e-06, + "loss": -0.0068, + "num_tokens": 8980209.0, + "reward": 13.105493545532227, + "reward_std": 2.8300795555114746, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4735350608825684, + "rewards/kidney_reward/std": 0.8651843667030334, + "rewards/length2tails_reward/mean": 0.7215414047241211, + "rewards/length2tails_reward/std": 0.294627845287323, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.156128168106079, + "rewards/thermo_reward/std": 1.8703521490097046, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0927619244903326, + "epoch": 2.064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08014991134405136, + "learning_rate": 1.3077972173261983e-06, + "loss": -0.0042, + "num_tokens": 8988947.0, + "reward": 13.46107292175293, + "reward_std": 1.571915864944458, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5046491622924805, + "rewards/kidney_reward/std": 0.5515542030334473, + "rewards/length2tails_reward/mean": 0.7611790895462036, + "rewards/length2tails_reward/std": 0.2673257291316986, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4191207885742188, + "rewards/thermo_reward/std": 1.0504286289215088, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.40625, + "completions/mean_terminated_length": 272.40625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09465755335986614, + "epoch": 2.066, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07499520480632782, + "learning_rate": 1.3065769341822632e-06, + "loss": -0.0006, + "num_tokens": 8997696.0, + "reward": 13.690796852111816, + "reward_std": 0.8897803425788879, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.767320990562439, + "rewards/length2tails_reward/std": 0.24395054578781128, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.526400089263916, + "rewards/thermo_reward/std": 0.8900278806686401, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.08990177698433399, + "epoch": 2.068, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11101987957954407, + "learning_rate": 1.305356146949591e-06, + "loss": 0.0041, + "num_tokens": 9006424.0, + "reward": 13.398303985595703, + "reward_std": 1.4064549207687378, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7223544120788574, + "rewards/length2tails_reward/std": 0.29692718386650085, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2931222915649414, + "rewards/thermo_reward/std": 1.2151144742965698, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09254649002104998, + "epoch": 2.07, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13691101968288422, + "learning_rate": 1.3041348576354594e-06, + "loss": 0.0028, + "num_tokens": 9015189.0, + "reward": 13.770145416259766, + "reward_std": 0.5533500909805298, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8021765947341919, + "rewards/length2tails_reward/std": 0.2569216191768646, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5, + "completions/mean_terminated_length": 272.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09089200105518103, + "epoch": 2.072, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05383050814270973, + "learning_rate": 1.302913068247972e-06, + "loss": -0.0063, + "num_tokens": 9023941.0, + "reward": 12.733110427856445, + "reward_std": 3.2412171363830566, + "rewards/fitness_reward/mean": 6.99554443359375, + "rewards/fitness_reward/std": 1.7628074884414673, + "rewards/kidney_reward/mean": 2.374934673309326, + "rewards/kidney_reward/std": 0.579054057598114, + "rewards/length2tails_reward/mean": 0.7359942197799683, + "rewards/length2tails_reward/std": 0.3207305073738098, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1890320777893066, + "rewards/thermo_reward/std": 1.321785569190979, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09400006104260683, + "epoch": 2.074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19333302974700928, + "learning_rate": 1.3016907807960549e-06, + "loss": -0.0012, + "num_tokens": 9032683.0, + "reward": 12.365195274353027, + "reward_std": 6.029626369476318, + "rewards/fitness_reward/mean": 6.920212745666504, + "rewards/fitness_reward/std": 2.4945178031921387, + "rewards/kidney_reward/mean": 2.205441474914551, + "rewards/kidney_reward/std": 1.681620717048645, + "rewards/length2tails_reward/mean": 0.7598259449005127, + "rewards/length2tails_reward/std": 0.2702024579048157, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.063559055328369, + "rewards/thermo_reward/std": 2.0836944580078125, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0850873845629394, + "epoch": 2.076, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3471170663833618, + "learning_rate": 1.3004679972894518e-06, + "loss": -0.004, + "num_tokens": 9041420.0, + "reward": 13.284088134765625, + "reward_std": 2.320343255996704, + "rewards/fitness_reward/mean": 7.049720287322998, + "rewards/fitness_reward/std": 1.7619104385375977, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7075710296630859, + "rewards/length2tails_reward/std": 0.2854107916355133, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.464489221572876, + "rewards/thermo_reward/std": 1.0130728483200073, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09551345184445381, + "epoch": 2.078, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08493397384881973, + "learning_rate": 1.2992447197387238e-06, + "loss": -0.0022, + "num_tokens": 9050121.0, + "reward": 13.317670822143555, + "reward_std": 2.1450092792510986, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.445978879928589, + "rewards/kidney_reward/std": 0.7337779402732849, + "rewards/length2tails_reward/mean": 0.6894269585609436, + "rewards/length2tails_reward/std": 0.3035351037979126, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3415637016296387, + "rewards/thermo_reward/std": 1.474583625793457, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 274.46875, + "completions/mean_terminated_length": 274.46875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.1033138744533062, + "epoch": 2.08, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09677430987358093, + "learning_rate": 1.2980209501552426e-06, + "loss": -0.0012, + "num_tokens": 9058936.0, + "reward": 13.546756744384766, + "reward_std": 1.086491584777832, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.8765187859535217, + "rewards/length2tails_reward/std": 0.17953187227249146, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.453517198562622, + "rewards/thermo_reward/std": 0.8846840858459473, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08544276189059019, + "epoch": 2.082, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10554521530866623, + "learning_rate": 1.2967966905511905e-06, + "loss": -0.0064, + "num_tokens": 9067666.0, + "reward": 13.696599960327148, + "reward_std": 1.0239237546920776, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.690746009349823, + "rewards/length2tails_reward/std": 0.35588833689689636, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5672197341918945, + "rewards/thermo_reward/std": 0.8686692714691162, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09616864379495382, + "epoch": 2.084, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12321001291275024, + "learning_rate": 1.2955719429395546e-06, + "loss": -0.0042, + "num_tokens": 9076360.0, + "reward": 13.158604621887207, + "reward_std": 2.375380516052246, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.486342430114746, + "rewards/kidney_reward/std": 0.651604175567627, + "rewards/length2tails_reward/mean": 0.700831413269043, + "rewards/length2tails_reward/std": 0.3187938928604126, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.140994071960449, + "rewards/thermo_reward/std": 1.8229484558105469, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.10125172417610884, + "epoch": 2.086, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1306227445602417, + "learning_rate": 1.2943467093341244e-06, + "loss": 0.0002, + "num_tokens": 9085074.0, + "reward": 12.651572227478027, + "reward_std": 5.144802093505859, + "rewards/fitness_reward/mean": 6.965117454528809, + "rewards/fitness_reward/std": 2.240497589111328, + "rewards/kidney_reward/mean": 2.4036145210266113, + "rewards/kidney_reward/std": 1.2607145309448242, + "rewards/length2tails_reward/mean": 0.7663254737854004, + "rewards/length2tails_reward/std": 0.2889622747898102, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1062076091766357, + "rewards/thermo_reward/std": 1.9814021587371826, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10001961886882782, + "epoch": 2.088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09242808818817139, + "learning_rate": 1.2931209917494894e-06, + "loss": -0.0029, + "num_tokens": 9093802.0, + "reward": 13.539291381835938, + "reward_std": 1.2714763879776, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7442175149917603, + "rewards/length2tails_reward/std": 0.29208293557167053, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.377204418182373, + "rewards/thermo_reward/std": 1.263572335243225, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.0941868107765913, + "epoch": 2.09, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0928439348936081, + "learning_rate": 1.2918947922010336e-06, + "loss": -0.0035, + "num_tokens": 9102510.0, + "reward": 13.199728012084961, + "reward_std": 2.9908151626586914, + "rewards/fitness_reward/mean": 7.01439094543457, + "rewards/fitness_reward/std": 1.9617626667022705, + "rewards/kidney_reward/mean": 2.4852182865142822, + "rewards/kidney_reward/std": 0.6577827334403992, + "rewards/length2tails_reward/mean": 0.7584496736526489, + "rewards/length2tails_reward/std": 0.2767479121685028, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.524273633956909, + "rewards/thermo_reward/std": 1.296755313873291, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.09199702274054289, + "epoch": 2.092, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10496368259191513, + "learning_rate": 1.2906681127049338e-06, + "loss": 0.0011, + "num_tokens": 9111222.0, + "reward": 13.468603134155273, + "reward_std": 1.3402777910232544, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7977195382118225, + "rewards/length2tails_reward/std": 0.25168755650520325, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3558850288391113, + "rewards/thermo_reward/std": 1.1745244264602661, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.78125, + "completions/mean_terminated_length": 273.78125, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.08931320626288652, + "epoch": 2.094, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09162434935569763, + "learning_rate": 1.2894409552781564e-06, + "loss": -0.0006, + "num_tokens": 9120015.0, + "reward": 13.722648620605469, + "reward_std": 0.9620515704154968, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8560950756072998, + "rewards/length2tails_reward/std": 0.16277632117271423, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.549374580383301, + "rewards/thermo_reward/std": 0.9605962038040161, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.65625, + "completions/mean_terminated_length": 271.65625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10179832112044096, + "epoch": 2.096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15232229232788086, + "learning_rate": 1.2882133219384538e-06, + "loss": -0.0009, + "num_tokens": 9128740.0, + "reward": 13.005573272705078, + "reward_std": 4.204520225524902, + "rewards/fitness_reward/mean": 7.045802116394043, + "rewards/fitness_reward/std": 1.7840752601623535, + "rewards/kidney_reward/mean": 2.449796438217163, + "rewards/kidney_reward/std": 0.9994713068008423, + "rewards/length2tails_reward/mean": 0.7428468465805054, + "rewards/length2tails_reward/std": 0.2785884141921997, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3356895446777344, + "rewards/thermo_reward/std": 1.4818718433380127, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09577594231814146, + "epoch": 2.098, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07349243760108948, + "learning_rate": 1.2869852147043605e-06, + "loss": 0.0014, + "num_tokens": 9137497.0, + "reward": 13.581193923950195, + "reward_std": 1.2682756185531616, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7896881103515625, + "rewards/length2tails_reward/std": 0.251890629529953, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4419198036193848, + "rewards/thermo_reward/std": 1.1277644634246826, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08849729225039482, + "epoch": 2.1, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10994908213615417, + "learning_rate": 1.2857566355951903e-06, + "loss": -0.0046, + "num_tokens": 9146254.0, + "reward": 13.484933853149414, + "reward_std": 1.3967456817626953, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7338998317718506, + "rewards/length2tails_reward/std": 0.3522545397281647, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.35123872756958, + "rewards/thermo_reward/std": 1.2868434190750122, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.0, + "completions/mean_terminated_length": 273.0, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08763735927641392, + "epoch": 2.102, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08444438874721527, + "learning_rate": 1.2845275866310324e-06, + "loss": -0.0005, + "num_tokens": 9155022.0, + "reward": 12.240044593811035, + "reward_std": 5.094367504119873, + "rewards/fitness_reward/mean": 6.943960189819336, + "rewards/fitness_reward/std": 2.050309658050537, + "rewards/kidney_reward/mean": 2.2489984035491943, + "rewards/kidney_reward/std": 1.3802794218063354, + "rewards/length2tails_reward/mean": 0.7449901103973389, + "rewards/length2tails_reward/std": 0.31968122720718384, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.872586727142334, + "rewards/thermo_reward/std": 2.0441460609436035, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09333954565227032, + "epoch": 2.104, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.058305464684963226, + "learning_rate": 1.2832980698327494e-06, + "loss": 0.0052, + "num_tokens": 9163762.0, + "reward": 13.915213584899902, + "reward_std": 0.30910414457321167, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7826879620552063, + "rewards/length2tails_reward/std": 0.2214636653661728, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1027986891567707, + "epoch": 2.106, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10487403720617294, + "learning_rate": 1.2820680872219728e-06, + "loss": 0.002, + "num_tokens": 9172500.0, + "reward": 12.580307960510254, + "reward_std": 2.7758026123046875, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4055867195129395, + "rewards/kidney_reward/std": 0.7715975046157837, + "rewards/length2tails_reward/mean": 0.6917127370834351, + "rewards/length2tails_reward/std": 0.3424265384674072, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6443653106689453, + "rewards/thermo_reward/std": 2.147477865219116, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.5, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09246054664254189, + "epoch": 2.108, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1013147234916687, + "learning_rate": 1.2808376408210994e-06, + "loss": -0.0001, + "num_tokens": 9181284.0, + "reward": 13.547618865966797, + "reward_std": 1.146050214767456, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8119996786117554, + "rewards/length2tails_reward/std": 0.21671175956726074, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.406114101409912, + "rewards/thermo_reward/std": 1.1155322790145874, + "step": 1054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09602517727762461, + "epoch": 2.11, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0790761411190033, + "learning_rate": 1.27960673265329e-06, + "loss": -0.0023, + "num_tokens": 9190017.0, + "reward": 13.315729141235352, + "reward_std": 1.9026858806610107, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5110602378845215, + "rewards/kidney_reward/std": 0.5168427228927612, + "rewards/length2tails_reward/mean": 0.7272899150848389, + "rewards/length2tails_reward/std": 0.3061966896057129, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.328263759613037, + "rewards/thermo_reward/std": 1.1189709901809692, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.25, + "completions/mean_terminated_length": 273.25, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09410333167761564, + "epoch": 2.112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10358601808547974, + "learning_rate": 1.2783753647424632e-06, + "loss": -0.0006, + "num_tokens": 9198793.0, + "reward": 13.800519943237305, + "reward_std": 0.47215381264686584, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8323312401771545, + "rewards/length2tails_reward/std": 0.21438726782798767, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.09133266471326351, + "epoch": 2.114, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12086813151836395, + "learning_rate": 1.2771435391132943e-06, + "loss": -0.0026, + "num_tokens": 9207479.0, + "reward": 13.8716459274292, + "reward_std": 0.38527801632881165, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7458708882331848, + "rewards/length2tails_reward/std": 0.27414339780807495, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "entropy": 0.10229693353176117, + "epoch": 2.116, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08657363057136536, + "learning_rate": 1.275911257791211e-06, + "loss": -0.0049, + "num_tokens": 9216172.0, + "reward": 12.801111221313477, + "reward_std": 3.3229711055755615, + "rewards/fitness_reward/mean": 6.99554443359375, + "rewards/fitness_reward/std": 1.7628074884414673, + "rewards/kidney_reward/mean": 2.3960742950439453, + "rewards/kidney_reward/std": 0.7102473974227905, + "rewards/length2tails_reward/mean": 0.6917217969894409, + "rewards/length2tails_reward/std": 0.32955726981163025, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2403202056884766, + "rewards/thermo_reward/std": 1.3757102489471436, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.15625, + "completions/mean_terminated_length": 273.15625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10404617059975863, + "epoch": 2.118, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0734260156750679, + "learning_rate": 1.2746785228023901e-06, + "loss": -0.0053, + "num_tokens": 9224945.0, + "reward": 13.67131233215332, + "reward_std": 1.030469536781311, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8276180624961853, + "rewards/length2tails_reward/std": 0.2293540984392166, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.500884771347046, + "rewards/thermo_reward/std": 1.0189565420150757, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08712383266538382, + "epoch": 2.12, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10176970064640045, + "learning_rate": 1.2734453361737551e-06, + "loss": -0.0046, + "num_tokens": 9233674.0, + "reward": 13.347003936767578, + "reward_std": 1.3631689548492432, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5095105171203613, + "rewards/kidney_reward/std": 0.5252142548561096, + "rewards/length2tails_reward/mean": 0.6960784196853638, + "rewards/length2tails_reward/std": 0.329012930393219, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3067002296447754, + "rewards/thermo_reward/std": 1.1650335788726807, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0984044810757041, + "epoch": 2.122, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1514298915863037, + "learning_rate": 1.272211699932971e-06, + "loss": -0.0008, + "num_tokens": 9242419.0, + "reward": 12.934194564819336, + "reward_std": 2.576376438140869, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.3321802616119385, + "rewards/kidney_reward/std": 0.9315749406814575, + "rewards/length2tails_reward/mean": 0.7589224576950073, + "rewards/length2tails_reward/std": 0.3022225499153137, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.064936637878418, + "rewards/thermo_reward/std": 1.6775540113449097, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08853981457650661, + "epoch": 2.124, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19971629977226257, + "learning_rate": 1.2709776161084432e-06, + "loss": -0.0038, + "num_tokens": 9251120.0, + "reward": 12.858133316040039, + "reward_std": 3.0192081928253174, + "rewards/fitness_reward/mean": 7.05157995223999, + "rewards/fitness_reward/std": 1.7513916492462158, + "rewards/kidney_reward/mean": 2.517043113708496, + "rewards/kidney_reward/std": 0.2941751182079315, + "rewards/length2tails_reward/mean": 0.6278231739997864, + "rewards/length2tails_reward/std": 0.35386279225349426, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1267285346984863, + "rewards/thermo_reward/std": 1.473052740097046, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0893591595813632, + "epoch": 2.126, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06807100772857666, + "learning_rate": 1.2697430867293118e-06, + "loss": -0.0046, + "num_tokens": 9259859.0, + "reward": 12.771037101745605, + "reward_std": 3.42122220993042, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.457012176513672, + "rewards/kidney_reward/std": 0.5447914004325867, + "rewards/length2tails_reward/mean": 0.7093701362609863, + "rewards/length2tails_reward/std": 0.30901870131492615, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.090034246444702, + "rewards/thermo_reward/std": 1.7447394132614136, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09699106961488724, + "epoch": 2.128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09156450629234314, + "learning_rate": 1.2685081138254504e-06, + "loss": 0.0015, + "num_tokens": 9268602.0, + "reward": 13.582111358642578, + "reward_std": 1.680040955543518, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5325703620910645, + "rewards/kidney_reward/std": 0.5312304496765137, + "rewards/length2tails_reward/mean": 0.7863272428512573, + "rewards/length2tails_reward/std": 0.2618371248245239, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5097227096557617, + "rewards/thermo_reward/std": 1.1701918840408325, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09031611122190952, + "epoch": 2.13, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08338166028261185, + "learning_rate": 1.267272699427462e-06, + "loss": -0.0009, + "num_tokens": 9277342.0, + "reward": 13.708481788635254, + "reward_std": 0.8082547187805176, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7606251835823059, + "rewards/length2tails_reward/std": 0.26824161410331726, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08373354189097881, + "epoch": 2.132, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08093984425067902, + "learning_rate": 1.266036845566675e-06, + "loss": -0.0035, + "num_tokens": 9286100.0, + "reward": 12.996624946594238, + "reward_std": 4.823812961578369, + "rewards/fitness_reward/mean": 7.006075859069824, + "rewards/fitness_reward/std": 2.008800983428955, + "rewards/kidney_reward/mean": 2.3853907585144043, + "rewards/kidney_reward/std": 1.2139407396316528, + "rewards/length2tails_reward/mean": 0.7885721325874329, + "rewards/length2tails_reward/std": 0.2286292165517807, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.426300525665283, + "rewards/thermo_reward/std": 1.6240335702896118, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.53125, + "completions/mean_terminated_length": 272.53125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09700395073741674, + "epoch": 2.134, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20416933298110962, + "learning_rate": 1.2648005542751405e-06, + "loss": 0.0052, + "num_tokens": 9294853.0, + "reward": 13.65412712097168, + "reward_std": 1.079277753829956, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7824170589447021, + "rewards/length2tails_reward/std": 0.2475566267967224, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.515580177307129, + "rewards/thermo_reward/std": 0.9441563487052917, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0891966512426734, + "epoch": 2.136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06771207600831985, + "learning_rate": 1.2635638275856287e-06, + "loss": -0.0061, + "num_tokens": 9303610.0, + "reward": 12.930241584777832, + "reward_std": 2.634490966796875, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4129862785339355, + "rewards/kidney_reward/std": 0.7415686249732971, + "rewards/length2tails_reward/mean": 0.7700674533843994, + "rewards/length2tails_reward/std": 0.28495827317237854, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.979062795639038, + "rewards/thermo_reward/std": 1.927284598350525, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.03125, + "completions/mean_terminated_length": 271.03125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.09346739295870066, + "epoch": 2.138, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20411282777786255, + "learning_rate": 1.2623266675316263e-06, + "loss": -0.0098, + "num_tokens": 9312315.0, + "reward": 13.800832748413086, + "reward_std": 0.47488632798194885, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8354582190513611, + "rewards/length2tails_reward/std": 0.2633455991744995, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0999509422108531, + "epoch": 2.14, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05981971323490143, + "learning_rate": 1.2610890761473315e-06, + "loss": -0.0048, + "num_tokens": 9321072.0, + "reward": 13.200286865234375, + "reward_std": 2.0932772159576416, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4858555793762207, + "rewards/kidney_reward/std": 0.6542784571647644, + "rewards/length2tails_reward/mean": 0.730256199836731, + "rewards/length2tails_reward/std": 0.32691308856010437, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.180220603942871, + "rewards/thermo_reward/std": 1.4679925441741943, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.625, + "completions/mean_terminated_length": 269.625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "entropy": 0.09472297970205545, + "epoch": 2.142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09255687892436981, + "learning_rate": 1.2598510554676528e-06, + "loss": -0.0027, + "num_tokens": 9329732.0, + "reward": 13.429327011108398, + "reward_std": 1.4543567895889282, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.711338222026825, + "rewards/length2tails_reward/std": 0.29834526777267456, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2978878021240234, + "rewards/thermo_reward/std": 1.341590166091919, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09138771425932646, + "epoch": 2.144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0674971267580986, + "learning_rate": 1.2586126075282045e-06, + "loss": -0.0051, + "num_tokens": 9338462.0, + "reward": 13.338953018188477, + "reward_std": 2.1979379653930664, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.466801643371582, + "rewards/kidney_reward/std": 0.7594356536865234, + "rewards/length2tails_reward/mean": 0.7021905183792114, + "rewards/length2tails_reward/std": 0.3625527024269104, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.398256778717041, + "rewards/thermo_reward/std": 1.1553550958633423, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.40625, + "completions/mean_terminated_length": 272.40625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09341870713979006, + "epoch": 2.146, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10488131642341614, + "learning_rate": 1.2573737343653023e-06, + "loss": -0.0048, + "num_tokens": 9347211.0, + "reward": 13.0523042678833, + "reward_std": 2.726144313812256, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.3864073753356934, + "rewards/kidney_reward/std": 0.9536484479904175, + "rewards/length2tails_reward/mean": 0.7425001859664917, + "rewards/length2tails_reward/std": 0.32825616002082825, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1304609775543213, + "rewards/thermo_reward/std": 1.8517941236495972, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10365541558712721, + "epoch": 2.148, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22300338745117188, + "learning_rate": 1.2561344380159627e-06, + "loss": -0.0038, + "num_tokens": 9355976.0, + "reward": 13.19325065612793, + "reward_std": 2.598496913909912, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4012303352355957, + "rewards/kidney_reward/std": 0.8964017033576965, + "rewards/length2tails_reward/mean": 0.8006282448768616, + "rewards/length2tails_reward/std": 0.25021499395370483, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3082823753356934, + "rewards/thermo_reward/std": 1.4561070203781128, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.28125, + "completions/mean_terminated_length": 273.28125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08755374420434237, + "epoch": 2.15, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07114322483539581, + "learning_rate": 1.2548947205178962e-06, + "loss": -0.0012, + "num_tokens": 9364753.0, + "reward": 13.618678092956543, + "reward_std": 0.9257692694664001, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8494964838027954, + "rewards/length2tails_reward/std": 0.16474813222885132, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4460642337799072, + "rewards/thermo_reward/std": 0.9196395874023438, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.0993583258241415, + "epoch": 2.152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07258055359125137, + "learning_rate": 1.2536545839095072e-06, + "loss": -0.0037, + "num_tokens": 9373498.0, + "reward": 12.512422561645508, + "reward_std": 5.39253044128418, + "rewards/fitness_reward/mean": 6.981512546539307, + "rewards/fitness_reward/std": 2.147751569747925, + "rewards/kidney_reward/mean": 2.2829885482788086, + "rewards/kidney_reward/std": 1.4037550687789917, + "rewards/length2tails_reward/mean": 0.7973028421401978, + "rewards/length2tails_reward/std": 0.27729007601737976, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.068190813064575, + "rewards/thermo_reward/std": 2.207812786102295, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09348468575626612, + "epoch": 2.154, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08929450809955597, + "learning_rate": 1.2524140302298891e-06, + "loss": -0.004, + "num_tokens": 9382259.0, + "reward": 13.80868911743164, + "reward_std": 0.526460587978363, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.788748025894165, + "rewards/length2tails_reward/std": 0.27583274245262146, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.0946068437770009, + "epoch": 2.156, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0643715113401413, + "learning_rate": 1.2511730615188204e-06, + "loss": -0.006, + "num_tokens": 9390979.0, + "reward": 12.824613571166992, + "reward_std": 2.739434242248535, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.348367214202881, + "rewards/kidney_reward/std": 0.9928209781646729, + "rewards/length2tails_reward/mean": 0.7526307106018066, + "rewards/length2tails_reward/std": 0.367491751909256, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.939798355102539, + "rewards/thermo_reward/std": 1.7655022144317627, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10376318357884884, + "epoch": 2.158, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16339758038520813, + "learning_rate": 1.249931679816762e-06, + "loss": 0.0051, + "num_tokens": 9399722.0, + "reward": 11.997785568237305, + "reward_std": 6.722228527069092, + "rewards/fitness_reward/mean": 6.633974075317383, + "rewards/fitness_reward/std": 2.8792197704315186, + "rewards/kidney_reward/mean": 2.1771368980407715, + "rewards/kidney_reward/std": 1.6707159280776978, + "rewards/length2tails_reward/mean": 0.78455650806427, + "rewards/length2tails_reward/std": 0.2624914050102234, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0082194805145264, + "rewards/thermo_reward/std": 2.290698528289795, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "entropy": 0.09403804317116737, + "epoch": 2.16, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09548342227935791, + "learning_rate": 1.2486898871648551e-06, + "loss": -0.0067, + "num_tokens": 9408421.0, + "reward": 13.616037368774414, + "reward_std": 1.0823116302490234, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.761284589767456, + "rewards/length2tails_reward/std": 0.2536487877368927, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.452244281768799, + "rewards/thermo_reward/std": 1.0749342441558838, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.09731936268508434, + "epoch": 2.162, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10964816063642502, + "learning_rate": 1.2474476856049143e-06, + "loss": -0.0017, + "num_tokens": 9417129.0, + "reward": 13.43002986907959, + "reward_std": 1.321260929107666, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7619011402130127, + "rewards/length2tails_reward/std": 0.29861509799957275, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.293534517288208, + "rewards/thermo_reward/std": 1.2125825881958008, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.15625, + "completions/mean_terminated_length": 273.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08726116176694632, + "epoch": 2.164, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07873732596635818, + "learning_rate": 1.2462050771794292e-06, + "loss": -0.0035, + "num_tokens": 9425902.0, + "reward": 13.51996898651123, + "reward_std": 2.503188133239746, + "rewards/fitness_reward/mean": 6.988970756530762, + "rewards/fitness_reward/std": 2.1055612564086914, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8259742259979248, + "rewards/length2tails_reward/std": 0.24308039247989655, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.09302136953920126, + "epoch": 2.166, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13266915082931519, + "learning_rate": 1.2449620639315567e-06, + "loss": 0.0018, + "num_tokens": 9434637.0, + "reward": 13.589839935302734, + "reward_std": 1.0492033958435059, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8250849843025208, + "rewards/length2tails_reward/std": 0.22939039766788483, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4470252990722656, + "rewards/thermo_reward/std": 0.9150979518890381, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.71875, + "completions/mean_terminated_length": 272.71875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09248879831284285, + "epoch": 2.168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1034470871090889, + "learning_rate": 1.2437186479051198e-06, + "loss": -0.0023, + "num_tokens": 9443396.0, + "reward": 13.570369720458984, + "reward_std": 1.1367368698120117, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5380520820617676, + "rewards/kidney_reward/std": 0.500221848487854, + "rewards/length2tails_reward/mean": 0.7879185676574707, + "rewards/length2tails_reward/std": 0.24495474994182587, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498504638671875, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09990228898823261, + "epoch": 2.17, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1326744556427002, + "learning_rate": 1.2424748311446038e-06, + "loss": -0.0034, + "num_tokens": 9452119.0, + "reward": 13.452388763427734, + "reward_std": 1.3028936386108398, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.715055525302887, + "rewards/length2tails_reward/std": 0.2911849617958069, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.320578098297119, + "rewards/thermo_reward/std": 1.2647727727890015, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09053017944097519, + "epoch": 2.172, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09604272991418839, + "learning_rate": 1.2412306156951524e-06, + "loss": -0.0044, + "num_tokens": 9460846.0, + "reward": 13.911317825317383, + "reward_std": 0.3229130804538727, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7437255382537842, + "rewards/length2tails_reward/std": 0.24738410115242004, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10133631248027086, + "epoch": 2.174, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.070340096950531, + "learning_rate": 1.2399860036025658e-06, + "loss": -0.0039, + "num_tokens": 9469559.0, + "reward": 12.913904190063477, + "reward_std": 3.856182813644409, + "rewards/fitness_reward/mean": 7.050872802734375, + "rewards/fitness_reward/std": 1.755391001701355, + "rewards/kidney_reward/mean": 2.4534530639648438, + "rewards/kidney_reward/std": 0.8335072994232178, + "rewards/length2tails_reward/mean": 0.6835463047027588, + "rewards/length2tails_reward/std": 0.3237307369709015, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.241224527359009, + "rewards/thermo_reward/std": 1.4365715980529785, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0889931432902813, + "epoch": 2.176, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06201954558491707, + "learning_rate": 1.2387409969132959e-06, + "loss": -0.0045, + "num_tokens": 9478253.0, + "reward": 13.459275245666504, + "reward_std": 3.005136013031006, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.5390896797180176, + "rewards/kidney_reward/std": 0.49435171484947205, + "rewards/length2tails_reward/mean": 0.7147153615951538, + "rewards/length2tails_reward/std": 0.2847312092781067, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.69566011428833, + "rewards/thermo_reward/std": 0.7545809149742126, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09669537469744682, + "epoch": 2.178, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13551564514636993, + "learning_rate": 1.237495597674443e-06, + "loss": 0.001, + "num_tokens": 9487007.0, + "reward": 12.33701229095459, + "reward_std": 4.668380260467529, + "rewards/fitness_reward/mean": 6.98996114730835, + "rewards/fitness_reward/std": 1.793858528137207, + "rewards/kidney_reward/mean": 2.2286503314971924, + "rewards/kidney_reward/std": 1.217890977859497, + "rewards/length2tails_reward/mean": 0.8198975324630737, + "rewards/length2tails_reward/std": 0.24321754276752472, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.936410903930664, + "rewards/thermo_reward/std": 2.0050437450408936, + "step": 1089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.65625, + "completions/mean_terminated_length": 271.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09151883330196142, + "epoch": 2.18, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08709075301885605, + "learning_rate": 1.236249807933753e-06, + "loss": -0.0027, + "num_tokens": 9495732.0, + "reward": 12.781303405761719, + "reward_std": 2.283271551132202, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.354283332824707, + "rewards/kidney_reward/std": 0.8840342164039612, + "rewards/length2tails_reward/mean": 0.7024877071380615, + "rewards/length2tails_reward/std": 0.3341656029224396, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.953094720840454, + "rewards/thermo_reward/std": 1.5103843212127686, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.5, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08445745427161455, + "epoch": 2.182, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17772817611694336, + "learning_rate": 1.2350036297396152e-06, + "loss": -0.0015, + "num_tokens": 9504516.0, + "reward": 13.544971466064453, + "reward_std": 0.7186257839202881, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8183159232139587, + "rewards/length2tails_reward/std": 0.24614040553569794, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4301929473876953, + "rewards/thermo_reward/std": 0.6010706424713135, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.40625, + "completions/mean_terminated_length": 271.40625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.09109118394553661, + "epoch": 2.184, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08223754912614822, + "learning_rate": 1.2337570651410553e-06, + "loss": -0.0019, + "num_tokens": 9513233.0, + "reward": 13.33891773223877, + "reward_std": 3.061128616333008, + "rewards/fitness_reward/mean": 6.987524509429932, + "rewards/fitness_reward/std": 2.113743305206299, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8037495613098145, + "rewards/length2tails_reward/std": 0.26475465297698975, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.571897029876709, + "rewards/thermo_reward/std": 0.844916045665741, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09510310553014278, + "epoch": 2.186, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39092686772346497, + "learning_rate": 1.2325101161877363e-06, + "loss": 0.0056, + "num_tokens": 9521991.0, + "reward": 12.502074241638184, + "reward_std": 5.3984293937683105, + "rewards/fitness_reward/mean": 6.960834503173828, + "rewards/fitness_reward/std": 2.26472544670105, + "rewards/kidney_reward/mean": 2.318307399749756, + "rewards/kidney_reward/std": 1.439921259880066, + "rewards/length2tails_reward/mean": 0.8009666204452515, + "rewards/length2tails_reward/std": 0.23079052567481995, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0428361892700195, + "rewards/thermo_reward/std": 1.952774167060852, + "step": 1093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.10605480056256056, + "epoch": 2.188, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13017325103282928, + "learning_rate": 1.2312627849299522e-06, + "loss": 0.0035, + "num_tokens": 9530732.0, + "reward": 13.25168228149414, + "reward_std": 1.6000559329986572, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.8000182509422302, + "rewards/length2tails_reward/std": 0.24549025297164917, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1660938262939453, + "rewards/thermo_reward/std": 1.3578734397888184, + "step": 1094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.10298249125480652, + "epoch": 2.19, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10652635991573334, + "learning_rate": 1.2300150734186257e-06, + "loss": 0.0029, + "num_tokens": 9539492.0, + "reward": 13.481754302978516, + "reward_std": 1.0265034437179565, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8570235967636108, + "rewards/length2tails_reward/std": 0.2003658562898636, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3357458114624023, + "rewards/thermo_reward/std": 0.9043408632278442, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.1003633365035057, + "epoch": 2.192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10922206193208694, + "learning_rate": 1.2287669837053055e-06, + "loss": 0.0032, + "num_tokens": 9548243.0, + "reward": 13.641023635864258, + "reward_std": 1.025151014328003, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7482881546020508, + "rewards/length2tails_reward/std": 0.3050001561641693, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5058889389038086, + "rewards/thermo_reward/std": 0.9933370351791382, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 285.4375, + "completions/mean_terminated_length": 285.4375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.09042511694133282, + "epoch": 2.194, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.114542007446289, + "learning_rate": 1.2275185178421606e-06, + "loss": 0.2213, + "num_tokens": 9557409.0, + "reward": 13.585801124572754, + "reward_std": 1.15471351146698, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8321831226348877, + "rewards/length2tails_reward/std": 0.24415747821331024, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.469636917114258, + "rewards/thermo_reward/std": 0.9873608350753784, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.84375, + "completions/mean_terminated_length": 273.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10123864654451609, + "epoch": 2.196, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09772907197475433, + "learning_rate": 1.2262696778819799e-06, + "loss": -0.0028, + "num_tokens": 9566204.0, + "reward": 13.775348663330078, + "reward_std": 0.5588542222976685, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8542090654373169, + "rewards/length2tails_reward/std": 0.23235513269901276, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08716964721679688, + "epoch": 2.198, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12908770143985748, + "learning_rate": 1.2250204658781673e-06, + "loss": 0.0039, + "num_tokens": 9574927.0, + "reward": 13.350052833557129, + "reward_std": 1.4368737936019897, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.708900511264801, + "rewards/length2tails_reward/std": 0.2893790602684021, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2188572883605957, + "rewards/thermo_reward/std": 1.3603311777114868, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.3125, + "completions/mean_terminated_length": 273.3125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.10285337548702955, + "epoch": 2.2, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0842437818646431, + "learning_rate": 1.2237708838847373e-06, + "loss": -0.0023, + "num_tokens": 9583705.0, + "reward": 13.880155563354492, + "reward_std": 0.3787178099155426, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8309652805328369, + "rewards/length2tails_reward/std": 0.23853476345539093, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09618408605456352, + "epoch": 2.202, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14736229181289673, + "learning_rate": 1.2225209339563143e-06, + "loss": -0.0039, + "num_tokens": 9592416.0, + "reward": 13.134501457214355, + "reward_std": 2.643918514251709, + "rewards/fitness_reward/mean": 7.004948616027832, + "rewards/fitness_reward/std": 2.0151779651641846, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7189492583274841, + "rewards/length2tails_reward/std": 0.2612570822238922, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3585379123687744, + "rewards/thermo_reward/std": 1.256617546081543, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5, + "completions/mean_terminated_length": 272.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09677848406136036, + "epoch": 2.204, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13801175355911255, + "learning_rate": 1.2212706181481266e-06, + "loss": -0.0007, + "num_tokens": 9601168.0, + "reward": 13.477926254272461, + "reward_std": 1.2938647270202637, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7869364619255066, + "rewards/length2tails_reward/std": 0.24135585129261017, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3389272689819336, + "rewards/thermo_reward/std": 1.1878148317337036, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09158475417643785, + "epoch": 2.206, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25704577565193176, + "learning_rate": 1.2200199385160039e-06, + "loss": -0.0023, + "num_tokens": 9609902.0, + "reward": 12.840763092041016, + "reward_std": 4.182301998138428, + "rewards/fitness_reward/mean": 7.025365829467773, + "rewards/fitness_reward/std": 1.8996788263320923, + "rewards/kidney_reward/mean": 2.440293073654175, + "rewards/kidney_reward/std": 0.9067621827125549, + "rewards/length2tails_reward/mean": 0.73990797996521, + "rewards/length2tails_reward/std": 0.2573379576206207, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.201112747192383, + "rewards/thermo_reward/std": 1.6995855569839478, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08908624108880758, + "epoch": 2.208, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1501232534646988, + "learning_rate": 1.2187688971163752e-06, + "loss": -0.0052, + "num_tokens": 9618653.0, + "reward": 13.237232208251953, + "reward_std": 3.0937626361846924, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.5390896797180176, + "rewards/kidney_reward/std": 0.49435171484947205, + "rewards/length2tails_reward/mean": 0.7692506313323975, + "rewards/length2tails_reward/std": 0.28168579936027527, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.46816349029541, + "rewards/thermo_reward/std": 1.1310467720031738, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08702242281287909, + "epoch": 2.21, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0907735675573349, + "learning_rate": 1.2175174960062635e-06, + "loss": -0.0029, + "num_tokens": 9627369.0, + "reward": 13.019981384277344, + "reward_std": 3.489938259124756, + "rewards/fitness_reward/mean": 7.005073547363281, + "rewards/fitness_reward/std": 2.014472723007202, + "rewards/kidney_reward/mean": 2.499762773513794, + "rewards/kidney_reward/std": 0.5781379342079163, + "rewards/length2tails_reward/mean": 0.6592178344726562, + "rewards/length2tails_reward/std": 0.3506201207637787, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3492231369018555, + "rewards/thermo_reward/std": 1.0179513692855835, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09034999832510948, + "epoch": 2.212, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35235831141471863, + "learning_rate": 1.2162657372432833e-06, + "loss": -0.0027, + "num_tokens": 9636098.0, + "reward": 13.280094146728516, + "reward_std": 2.321930408477783, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4449310302734375, + "rewards/kidney_reward/std": 0.8809229731559753, + "rewards/length2tails_reward/mean": 0.7409297227859497, + "rewards/length2tails_reward/std": 0.29801851511001587, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3573951721191406, + "rewards/thermo_reward/std": 1.1669204235076904, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.96875, + "completions/mean_terminated_length": 272.96875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09247970208525658, + "epoch": 2.214, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09246492385864258, + "learning_rate": 1.215013622885638e-06, + "loss": -0.0044, + "num_tokens": 9644865.0, + "reward": 13.759563446044922, + "reward_std": 0.5116779804229736, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8216274976730347, + "rewards/length2tails_reward/std": 0.18907445669174194, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09686029702425003, + "epoch": 2.216, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10450281202793121, + "learning_rate": 1.2137611549921145e-06, + "loss": 0.0029, + "num_tokens": 9653613.0, + "reward": 13.73473072052002, + "reward_std": 0.8503785729408264, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7688559293746948, + "rewards/length2tails_reward/std": 0.2738942503929138, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.570180654525757, + "rewards/thermo_reward/std": 0.8536139726638794, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.09789775125682354, + "epoch": 2.218, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05906645581126213, + "learning_rate": 1.2125083356220816e-06, + "loss": -0.0029, + "num_tokens": 9662331.0, + "reward": 13.155956268310547, + "reward_std": 3.0850579738616943, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.5390896797180176, + "rewards/kidney_reward/std": 0.49435171484947205, + "rewards/length2tails_reward/mean": 0.7530629634857178, + "rewards/length2tails_reward/std": 0.3211040496826172, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.388507127761841, + "rewards/thermo_reward/std": 1.147755742073059, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09061425412073731, + "epoch": 2.22, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1054484099149704, + "learning_rate": 1.2112551668354861e-06, + "loss": -0.0054, + "num_tokens": 9671069.0, + "reward": 13.273934364318848, + "reward_std": 2.2469191551208496, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4686717987060547, + "rewards/kidney_reward/std": 0.749081552028656, + "rewards/length2tails_reward/mean": 0.749596118927002, + "rewards/length2tails_reward/std": 0.2696993947029114, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.326627254486084, + "rewards/thermo_reward/std": 1.375333547592163, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08864146191626787, + "epoch": 2.222, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06858351826667786, + "learning_rate": 1.2100016506928491e-06, + "loss": -0.0034, + "num_tokens": 9679771.0, + "reward": 13.456001281738281, + "reward_std": 1.3390583992004395, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.6750684380531311, + "rewards/length2tails_reward/std": 0.3238985538482666, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.300830364227295, + "rewards/thermo_reward/std": 1.3286960124969482, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.0936619434505701, + "epoch": 2.224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1211986243724823, + "learning_rate": 1.2087477892552633e-06, + "loss": -0.0002, + "num_tokens": 9688492.0, + "reward": 13.617203712463379, + "reward_std": 0.6876401305198669, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7429279088973999, + "rewards/length2tails_reward/std": 0.27183738350868225, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5099644660949707, + "rewards/thermo_reward/std": 0.5615194439888, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.10044244676828384, + "epoch": 2.226, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11018425226211548, + "learning_rate": 1.2074935845843885e-06, + "loss": 0.0028, + "num_tokens": 9697193.0, + "reward": 13.161798477172852, + "reward_std": 2.0913569927215576, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4495739936828613, + "rewards/kidney_reward/std": 0.5821607708930969, + "rewards/length2tails_reward/mean": 0.7702677249908447, + "rewards/length2tails_reward/std": 0.27362877130508423, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.174013614654541, + "rewards/thermo_reward/std": 1.5860369205474854, + "step": 1113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 274.125, + "completions/mean_terminated_length": 274.125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09556818194687366, + "epoch": 2.228, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8129578828811646, + "learning_rate": 1.206239038742449e-06, + "loss": -0.0042, + "num_tokens": 9705997.0, + "reward": 13.08530044555664, + "reward_std": 2.681222915649414, + "rewards/fitness_reward/mean": 7.051130294799805, + "rewards/fitness_reward/std": 1.753933072090149, + "rewards/kidney_reward/mean": 2.456324815750122, + "rewards/kidney_reward/std": 0.5482165813446045, + "rewards/length2tails_reward/mean": 0.8678853511810303, + "rewards/length2tails_reward/std": 0.2075439840555191, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3910574913024902, + "rewards/thermo_reward/std": 1.0013716220855713, + "step": 1114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09249386843293905, + "epoch": 2.23, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10124978423118591, + "learning_rate": 1.2049841537922305e-06, + "loss": -0.0076, + "num_tokens": 9714733.0, + "reward": 12.888282775878906, + "reward_std": 2.722522735595703, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.485222816467285, + "rewards/kidney_reward/std": 0.6577568650245667, + "rewards/length2tails_reward/mean": 0.741077184677124, + "rewards/length2tails_reward/std": 0.33156618475914, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.925276517868042, + "rewards/thermo_reward/std": 2.0026774406433105, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09432558994740248, + "epoch": 2.232, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1653272807598114, + "learning_rate": 1.2037289317970756e-06, + "loss": 0.0041, + "num_tokens": 9723455.0, + "reward": 11.966596603393555, + "reward_std": 6.564814567565918, + "rewards/fitness_reward/mean": 6.64857292175293, + "rewards/fitness_reward/std": 2.827414035797119, + "rewards/kidney_reward/mean": 2.1993813514709473, + "rewards/kidney_reward/std": 1.6382735967636108, + "rewards/length2tails_reward/mean": 0.7423949241638184, + "rewards/length2tails_reward/std": 0.2909180223941803, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9444031715393066, + "rewards/thermo_reward/std": 2.245339870452881, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.09375, + "completions/mean_terminated_length": 273.09375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09407586511224508, + "epoch": 2.234, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21728630363941193, + "learning_rate": 1.2024733748208818e-06, + "loss": -0.002, + "num_tokens": 9732226.0, + "reward": 13.806081771850586, + "reward_std": 0.5215768218040466, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7626720070838928, + "rewards/length2tails_reward/std": 0.3082931637763977, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 274.03125, + "completions/mean_terminated_length": 274.03125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10200390871614218, + "epoch": 2.2359999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12265169620513916, + "learning_rate": 1.201217484928097e-06, + "loss": -0.0023, + "num_tokens": 9741027.0, + "reward": 13.404191970825195, + "reward_std": 2.496299982070923, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.453716278076172, + "rewards/kidney_reward/std": 0.9772971272468567, + "rewards/length2tails_reward/mean": 0.8660367727279663, + "rewards/length2tails_reward/std": 0.19639356434345245, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.402686834335327, + "rewards/thermo_reward/std": 1.5408576726913452, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09110501315444708, + "epoch": 2.238, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0856006145477295, + "learning_rate": 1.1999612641837166e-06, + "loss": -0.0, + "num_tokens": 9749768.0, + "reward": 13.267995834350586, + "reward_std": 1.4361052513122559, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7791933417320251, + "rewards/length2tails_reward/std": 0.24449069797992706, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1297717094421387, + "rewards/thermo_reward/std": 1.3451257944107056, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08734600804746151, + "epoch": 2.24, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08932296186685562, + "learning_rate": 1.1987047146532799e-06, + "loss": -0.0069, + "num_tokens": 9758505.0, + "reward": 13.060445785522461, + "reward_std": 2.2628417015075684, + "rewards/fitness_reward/mean": 7.188657760620117, + "rewards/fitness_reward/std": 0.7179933190345764, + "rewards/kidney_reward/mean": 2.508427143096924, + "rewards/kidney_reward/std": 0.5310735702514648, + "rewards/length2tails_reward/mean": 0.722597599029541, + "rewards/length2tails_reward/std": 0.30088114738464355, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1911020278930664, + "rewards/thermo_reward/std": 1.3022575378417969, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.34375, + "completions/mean_terminated_length": 273.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10040435381233692, + "epoch": 2.242, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15858429670333862, + "learning_rate": 1.197447838402867e-06, + "loss": 0.0067, + "num_tokens": 9767284.0, + "reward": 13.367097854614258, + "reward_std": 1.0577739477157593, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7829563021659851, + "rewards/length2tails_reward/std": 0.3015859127044678, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2558560371398926, + "rewards/thermo_reward/std": 0.9143465757369995, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.09375, + "completions/mean_terminated_length": 273.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09452992957085371, + "epoch": 2.2439999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07336730509996414, + "learning_rate": 1.196190637499095e-06, + "loss": -0.0041, + "num_tokens": 9776055.0, + "reward": 13.433874130249023, + "reward_std": 1.4296433925628662, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4767003059387207, + "rewards/kidney_reward/std": 0.5699067711830139, + "rewards/length2tails_reward/mean": 0.8327226638793945, + "rewards/length2tails_reward/std": 0.23930272459983826, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.412717342376709, + "rewards/thermo_reward/std": 0.9000661373138428, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.40625, + "completions/mean_terminated_length": 273.40625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09509277995675802, + "epoch": 2.246, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1498255878686905, + "learning_rate": 1.1949331140091152e-06, + "loss": -0.0019, + "num_tokens": 9784836.0, + "reward": 12.87520980834961, + "reward_std": 2.7726833820343018, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.452592670917511, + "rewards/kidney_reward/mean": 2.4142074584960938, + "rewards/kidney_reward/std": 0.7665389180183411, + "rewards/length2tails_reward/mean": 0.8063620328903198, + "rewards/length2tails_reward/std": 0.2866155505180359, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0341992378234863, + "rewards/thermo_reward/std": 1.8778098821640015, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09982185065746307, + "epoch": 2.248, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21547384560108185, + "learning_rate": 1.1936752700006086e-06, + "loss": 0.0022, + "num_tokens": 9793592.0, + "reward": 13.266108512878418, + "reward_std": 1.8420783281326294, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4757165908813477, + "rewards/kidney_reward/std": 0.5750632286071777, + "rewards/length2tails_reward/mean": 0.7889871001243591, + "rewards/length2tails_reward/std": 0.2755734622478485, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3078179359436035, + "rewards/thermo_reward/std": 1.0306305885314941, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.03125, + "completions/mean_terminated_length": 273.03125, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.09938167594373226, + "epoch": 2.25, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10929016023874283, + "learning_rate": 1.1924171075417836e-06, + "loss": 0.0005, + "num_tokens": 9802361.0, + "reward": 13.469961166381836, + "reward_std": 1.7545742988586426, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4885220527648926, + "rewards/kidney_reward/std": 0.6396322250366211, + "rewards/length2tails_reward/mean": 0.8194814324378967, + "rewards/length2tails_reward/std": 0.20707230269908905, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.438304901123047, + "rewards/thermo_reward/std": 1.1463873386383057, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.5, + "completions/mean_terminated_length": 272.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09620609972625971, + "epoch": 2.252, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10473023355007172, + "learning_rate": 1.1911586287013725e-06, + "loss": -0.002, + "num_tokens": 9811113.0, + "reward": 12.45807933807373, + "reward_std": 4.429269313812256, + "rewards/fitness_reward/mean": 7.04914665222168, + "rewards/fitness_reward/std": 1.7651554346084595, + "rewards/kidney_reward/mean": 2.2511706352233887, + "rewards/kidney_reward/std": 1.2130392789840698, + "rewards/length2tails_reward/mean": 0.7860097885131836, + "rewards/length2tails_reward/std": 0.25211507081985474, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.979160785675049, + "rewards/thermo_reward/std": 2.074921131134033, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.28125, + "completions/mean_terminated_length": 273.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09645435586571693, + "epoch": 2.254, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09141551703214645, + "learning_rate": 1.1898998355486272e-06, + "loss": -0.0055, + "num_tokens": 9819890.0, + "reward": 13.026348114013672, + "reward_std": 4.3629889488220215, + "rewards/fitness_reward/mean": 7.019981384277344, + "rewards/fitness_reward/std": 1.930139183998108, + "rewards/kidney_reward/mean": 2.436849355697632, + "rewards/kidney_reward/std": 1.0727108716964722, + "rewards/length2tails_reward/mean": 0.7998782396316528, + "rewards/length2tails_reward/std": 0.2706691324710846, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.389529228210449, + "rewards/thermo_reward/std": 1.4026683568954468, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.96875, + "completions/mean_terminated_length": 272.96875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09469268657267094, + "epoch": 2.2560000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08354964852333069, + "learning_rate": 1.188640730153317e-06, + "loss": 0.0018, + "num_tokens": 9828657.0, + "reward": 13.520135879516602, + "reward_std": 1.2767798900604248, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8036321401596069, + "rewards/length2tails_reward/std": 0.24694252014160156, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4068260192871094, + "rewards/thermo_reward/std": 1.1119424104690552, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1023802924901247, + "epoch": 2.258, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13477559387683868, + "learning_rate": 1.1873813145857248e-06, + "loss": 0.0004, + "num_tokens": 9837401.0, + "reward": 13.01758861541748, + "reward_std": 2.709385633468628, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.45963191986084, + "rewards/kidney_reward/std": 0.6603246331214905, + "rewards/length2tails_reward/mean": 0.7318527698516846, + "rewards/length2tails_reward/std": 0.2912712097167969, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.081094741821289, + "rewards/thermo_reward/std": 1.8870495557785034, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.46875, + "completions/mean_terminated_length": 273.46875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09059114195406437, + "epoch": 2.26, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13202516734600067, + "learning_rate": 1.186121590916642e-06, + "loss": -0.005, + "num_tokens": 9846184.0, + "reward": 13.627695083618164, + "reward_std": 1.1517388820648193, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5113868713378906, + "rewards/kidney_reward/std": 0.5150805115699768, + "rewards/length2tails_reward/mean": 0.8301060199737549, + "rewards/length2tails_reward/std": 0.23349504172801971, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08616469334810972, + "epoch": 2.262, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07374259829521179, + "learning_rate": 1.1848615612173686e-06, + "loss": -0.0039, + "num_tokens": 9854915.0, + "reward": 13.142324447631836, + "reward_std": 3.076014280319214, + "rewards/fitness_reward/mean": 7.0498809814453125, + "rewards/fitness_reward/std": 1.761002540588379, + "rewards/kidney_reward/mean": 2.5088601112365723, + "rewards/kidney_reward/std": 0.5287303328514099, + "rewards/length2tails_reward/mean": 0.730111837387085, + "rewards/length2tails_reward/std": 0.300480455160141, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.410572052001953, + "rewards/thermo_reward/std": 0.9098656177520752, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0960354171693325, + "epoch": 2.2640000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21694089472293854, + "learning_rate": 1.1836012275597065e-06, + "loss": 0.0005, + "num_tokens": 9863643.0, + "reward": 12.541905403137207, + "reward_std": 4.093541622161865, + "rewards/fitness_reward/mean": 7.038051605224609, + "rewards/fitness_reward/std": 1.8279199600219727, + "rewards/kidney_reward/mean": 2.418659210205078, + "rewards/kidney_reward/std": 0.8826969265937805, + "rewards/length2tails_reward/mean": 0.7205191850662231, + "rewards/length2tails_reward/std": 0.29001757502555847, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9131431579589844, + "rewards/thermo_reward/std": 1.8318647146224976, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09206268656998873, + "epoch": 2.266, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25878921151161194, + "learning_rate": 1.1823405920159574e-06, + "loss": -0.0009, + "num_tokens": 9872388.0, + "reward": 13.484138488769531, + "reward_std": 1.8616029024124146, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5085091590881348, + "rewards/kidney_reward/std": 0.5306293368339539, + "rewards/length2tails_reward/mean": 0.7648021578788757, + "rewards/length2tails_reward/std": 0.28547853231430054, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4379642009735107, + "rewards/thermo_reward/std": 1.3502850532531738, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09104644972831011, + "epoch": 2.268, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12931600213050842, + "learning_rate": 1.1810796566589206e-06, + "loss": -0.0011, + "num_tokens": 9881120.0, + "reward": 13.61038589477539, + "reward_std": 1.942013144493103, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5073652267456055, + "rewards/kidney_reward/std": 0.673812985420227, + "rewards/length2tails_reward/mean": 0.7605490684509277, + "rewards/length2tails_reward/std": 0.29428207874298096, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5657811164855957, + "rewards/thermo_reward/std": 1.2764577865600586, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 268.5, + "completions/mean_terminated_length": 268.5, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.0982245123013854, + "epoch": 2.27, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6288898587226868, + "learning_rate": 1.1798184235618866e-06, + "loss": -0.0922, + "num_tokens": 9889744.0, + "reward": 12.440940856933594, + "reward_std": 5.082667827606201, + "rewards/fitness_reward/mean": 6.977675437927246, + "rewards/fitness_reward/std": 2.1694583892822266, + "rewards/kidney_reward/mean": 2.3234381675720215, + "rewards/kidney_reward/std": 1.2618170976638794, + "rewards/length2tails_reward/mean": 0.8122677803039551, + "rewards/length2tails_reward/std": 0.28319400548934937, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9586005210876465, + "rewards/thermo_reward/std": 2.035763740539551, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 266.0, + "completions/mean_terminated_length": 266.0, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.09654929209500551, + "epoch": 2.2720000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8640089631080627, + "learning_rate": 1.1785568947986366e-06, + "loss": -0.1303, + "num_tokens": 9898288.0, + "reward": 12.790825843811035, + "reward_std": 4.592556953430176, + "rewards/fitness_reward/mean": 6.991696357727051, + "rewards/fitness_reward/std": 2.0901432037353516, + "rewards/kidney_reward/mean": 2.3974647521972656, + "rewards/kidney_reward/std": 1.146229863166809, + "rewards/length2tails_reward/mean": 0.7754591703414917, + "rewards/length2tails_reward/std": 0.2671588361263275, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.224119186401367, + "rewards/thermo_reward/std": 1.446903944015503, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09294535405933857, + "epoch": 2.274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12001582980155945, + "learning_rate": 1.177295072443438e-06, + "loss": -0.005, + "num_tokens": 9907054.0, + "reward": 13.032320022583008, + "reward_std": 3.873511791229248, + "rewards/fitness_reward/mean": 7.047097682952881, + "rewards/fitness_reward/std": 1.7767457962036133, + "rewards/kidney_reward/mean": 2.4707627296447754, + "rewards/kidney_reward/std": 0.8808674216270447, + "rewards/length2tails_reward/mean": 0.8012950420379639, + "rewards/length2tails_reward/std": 0.25205302238464355, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.334329843521118, + "rewards/thermo_reward/std": 1.284340739250183, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 274.25, + "completions/mean_terminated_length": 274.25, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.09910607803612947, + "epoch": 2.276, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1186261773109436, + "learning_rate": 1.1760329585710397e-06, + "loss": -0.0014, + "num_tokens": 9915862.0, + "reward": 12.83854866027832, + "reward_std": 4.40946626663208, + "rewards/fitness_reward/mean": 7.0206618309021, + "rewards/fitness_reward/std": 1.9262902736663818, + "rewards/kidney_reward/mean": 2.446737766265869, + "rewards/kidney_reward/std": 1.0167741775512695, + "rewards/length2tails_reward/mean": 0.8771859407424927, + "rewards/length2tails_reward/std": 0.15988872945308685, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1834309101104736, + "rewards/thermo_reward/std": 1.6658512353897095, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09410636872053146, + "epoch": 2.278, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08248171210289001, + "learning_rate": 1.1747705552566717e-06, + "loss": -0.0034, + "num_tokens": 9924593.0, + "reward": 13.137075424194336, + "reward_std": 2.7558913230895996, + "rewards/fitness_reward/mean": 6.993189334869385, + "rewards/fitness_reward/std": 2.081698179244995, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7481805086135864, + "rewards/length2tails_reward/std": 0.2986763119697571, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4246654510498047, + "rewards/thermo_reward/std": 1.154456615447998, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08954431302845478, + "epoch": 2.2800000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.098535917699337, + "learning_rate": 1.173507864576039e-06, + "loss": 0.0059, + "num_tokens": 9933321.0, + "reward": 13.397356033325195, + "reward_std": 1.8085975646972656, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5208089351654053, + "rewards/kidney_reward/std": 0.5977639555931091, + "rewards/length2tails_reward/mean": 0.7480281591415405, + "rewards/length2tails_reward/std": 0.2698073089122772, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.398068428039551, + "rewards/thermo_reward/std": 0.9680426120758057, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10984822269529104, + "epoch": 2.282, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08918312191963196, + "learning_rate": 1.172244888605319e-06, + "loss": -0.0014, + "num_tokens": 9942087.0, + "reward": 13.5318603515625, + "reward_std": 1.0554815530776978, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8391514420509338, + "rewards/length2tails_reward/std": 0.19658353924751282, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.387639045715332, + "rewards/thermo_reward/std": 1.0177863836288452, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.5625, + "completions/mean_terminated_length": 273.5625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09403904061764479, + "epoch": 2.284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08600325882434845, + "learning_rate": 1.1709816294211582e-06, + "loss": -0.0041, + "num_tokens": 9950873.0, + "reward": 13.683900833129883, + "reward_std": 1.0784289836883545, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8729538917541504, + "rewards/length2tails_reward/std": 0.17817066609859467, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.563659429550171, + "rewards/thermo_reward/std": 0.8868530988693237, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09034307207912207, + "epoch": 2.286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09123056381940842, + "learning_rate": 1.1697180891006689e-06, + "loss": -0.002, + "num_tokens": 9959602.0, + "reward": 13.693422317504883, + "reward_std": 0.8599316477775574, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7169778347015381, + "rewards/length2tails_reward/std": 0.33215200901031494, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5340588092803955, + "rewards/thermo_reward/std": 0.8522934317588806, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09334644302725792, + "epoch": 2.288, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09386125952005386, + "learning_rate": 1.168454269721426e-06, + "loss": -0.0031, + "num_tokens": 9968323.0, + "reward": 13.636048316955566, + "reward_std": 1.0688204765319824, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7532642483711243, + "rewards/length2tails_reward/std": 0.25923576951026917, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5277750492095947, + "rewards/thermo_reward/std": 0.883216381072998, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.53125, + "completions/mean_terminated_length": 273.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10313539113849401, + "epoch": 2.29, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08303596824407578, + "learning_rate": 1.1671901733614627e-06, + "loss": -0.0061, + "num_tokens": 9977108.0, + "reward": 13.582897186279297, + "reward_std": 1.599317193031311, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.51045298576355, + "rewards/kidney_reward/std": 0.5201210975646973, + "rewards/length2tails_reward/mean": 0.8479992151260376, + "rewards/length2tails_reward/std": 0.20036792755126953, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.526458740234375, + "rewards/thermo_reward/std": 1.0810075998306274, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09446029318496585, + "epoch": 2.292, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12472908198833466, + "learning_rate": 1.165925802099268e-06, + "loss": -0.0012, + "num_tokens": 9985848.0, + "reward": 12.099523544311523, + "reward_std": 5.365253448486328, + "rewards/fitness_reward/mean": 6.966158390045166, + "rewards/fitness_reward/std": 2.234607696533203, + "rewards/kidney_reward/mean": 2.1472251415252686, + "rewards/kidney_reward/std": 1.4632967710494995, + "rewards/length2tails_reward/mean": 0.7171519994735718, + "rewards/length2tails_reward/std": 0.31994375586509705, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.814424514770508, + "rewards/thermo_reward/std": 2.0936083793640137, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.09268662519752979, + "epoch": 2.294, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09646332263946533, + "learning_rate": 1.1646611580137823e-06, + "loss": -0.0064, + "num_tokens": 9994541.0, + "reward": 13.49638557434082, + "reward_std": 1.167270302772522, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.511590003967285, + "rewards/kidney_reward/std": 0.5139835476875305, + "rewards/length2tails_reward/mean": 0.7115417718887329, + "rewards/length2tails_reward/std": 0.32849302887916565, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.50996470451355, + "rewards/thermo_reward/std": 0.5615194439888, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.15625, + "completions/mean_terminated_length": 273.15625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09854295663535595, + "epoch": 2.296, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10047957301139832, + "learning_rate": 1.1633962431843955e-06, + "loss": -0.0029, + "num_tokens": 10003314.0, + "reward": 13.771848678588867, + "reward_std": 0.5571531653404236, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8192107677459717, + "rewards/length2tails_reward/std": 0.23006263375282288, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6296226978302, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08827311685308814, + "epoch": 2.298, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1572035402059555, + "learning_rate": 1.1621310596909421e-06, + "loss": 0.0043, + "num_tokens": 10012084.0, + "reward": 13.377315521240234, + "reward_std": 2.1593408584594727, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.501345157623291, + "rewards/kidney_reward/std": 0.7078666090965271, + "rewards/length2tails_reward/mean": 0.7940536737442017, + "rewards/length2tails_reward/std": 0.29090404510498047, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3928890228271484, + "rewards/thermo_reward/std": 1.1827491521835327, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.65625, + "completions/mean_terminated_length": 271.65625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08063013199716806, + "epoch": 2.3, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11249648779630661, + "learning_rate": 1.1608656096136983e-06, + "loss": 0.0014, + "num_tokens": 10020809.0, + "reward": 12.985828399658203, + "reward_std": 4.395967483520508, + "rewards/fitness_reward/mean": 7.006161212921143, + "rewards/fitness_reward/std": 2.008317708969116, + "rewards/kidney_reward/mean": 2.414630889892578, + "rewards/kidney_reward/std": 1.0500935316085815, + "rewards/length2tails_reward/mean": 0.7493383884429932, + "rewards/length2tails_reward/std": 0.27332738041877747, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3901023864746094, + "rewards/thermo_reward/std": 1.3996142148971558, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.96875, + "completions/mean_terminated_length": 272.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09177826624363661, + "epoch": 2.302, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19721175730228424, + "learning_rate": 1.1595998950333793e-06, + "loss": 0.0036, + "num_tokens": 10029576.0, + "reward": 13.098941802978516, + "reward_std": 3.715373992919922, + "rewards/fitness_reward/mean": 7.051706314086914, + "rewards/fitness_reward/std": 1.7506755590438843, + "rewards/kidney_reward/mean": 2.480517625808716, + "rewards/kidney_reward/std": 0.8256861567497253, + "rewards/length2tails_reward/mean": 0.8044684529304504, + "rewards/length2tails_reward/std": 0.25673922896385193, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3862717151641846, + "rewards/thermo_reward/std": 1.2167147397994995, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.90625, + "completions/mean_terminated_length": 273.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08292252477258444, + "epoch": 2.304, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14803364872932434, + "learning_rate": 1.158333918031134e-06, + "loss": -0.0003, + "num_tokens": 10038373.0, + "reward": 12.912191390991211, + "reward_std": 4.401137351989746, + "rewards/fitness_reward/mean": 7.018555164337158, + "rewards/fitness_reward/std": 1.938206672668457, + "rewards/kidney_reward/mean": 2.420194625854492, + "rewards/kidney_reward/std": 1.0189740657806396, + "rewards/length2tails_reward/mean": 0.8237855434417725, + "rewards/length2tails_reward/std": 0.27030956745147705, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.291062593460083, + "rewards/thermo_reward/std": 1.5103964805603027, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.6875, + "completions/mean_terminated_length": 273.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08876392152160406, + "epoch": 2.306, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10971035063266754, + "learning_rate": 1.157067680688544e-06, + "loss": 0.003, + "num_tokens": 10047163.0, + "reward": 13.720508575439453, + "reward_std": 0.5333245396614075, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.829923152923584, + "rewards/length2tails_reward/std": 0.23732823133468628, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498507022857666, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09525459352880716, + "epoch": 2.308, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0928049087524414, + "learning_rate": 1.1558011850876181e-06, + "loss": -0.0003, + "num_tokens": 10055878.0, + "reward": 13.224510192871094, + "reward_std": 1.984014630317688, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5322229862213135, + "rewards/kidney_reward/std": 0.5331960916519165, + "rewards/length2tails_reward/mean": 0.7088586688041687, + "rewards/length2tails_reward/std": 0.3383852541446686, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.160215377807617, + "rewards/thermo_reward/std": 1.6182459592819214, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "entropy": 0.09121548756957054, + "epoch": 2.31, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08748360723257065, + "learning_rate": 1.1545344333107904e-06, + "loss": -0.0005, + "num_tokens": 10064561.0, + "reward": 13.696756362915039, + "reward_std": 0.868507444858551, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7743874788284302, + "rewards/length2tails_reward/std": 0.262215256690979, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5316529273986816, + "rewards/thermo_reward/std": 0.8640903234481812, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 273.90625, + "completions/mean_terminated_length": 273.90625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.092183624394238, + "epoch": 2.312, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14290811121463776, + "learning_rate": 1.1532674274409157e-06, + "loss": 0.0001, + "num_tokens": 10073358.0, + "reward": 13.178348541259766, + "reward_std": 1.9608882665634155, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4581027030944824, + "rewards/kidney_reward/std": 0.6685106158256531, + "rewards/length2tails_reward/mean": 0.8021647930145264, + "rewards/length2tails_reward/std": 0.2939205467700958, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1788454055786133, + "rewards/thermo_reward/std": 1.3569666147232056, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.78125, + "completions/mean_terminated_length": 273.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09328475221991539, + "epoch": 2.314, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07290919125080109, + "learning_rate": 1.1520001695612673e-06, + "loss": -0.0031, + "num_tokens": 10082151.0, + "reward": 13.920624732971191, + "reward_std": 0.3185406029224396, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8367986679077148, + "rewards/length2tails_reward/std": 0.23183605074882507, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09928023163229227, + "epoch": 2.316, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14347343146800995, + "learning_rate": 1.1507326617555312e-06, + "loss": 0.0055, + "num_tokens": 10090872.0, + "reward": 13.569886207580566, + "reward_std": 1.0947325229644775, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7327724695205688, + "rewards/length2tails_reward/std": 0.2740943431854248, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4363033771514893, + "rewards/thermo_reward/std": 0.9662889838218689, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.96875, + "completions/mean_terminated_length": 272.96875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08951482083648443, + "epoch": 2.318, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11145684123039246, + "learning_rate": 1.1494649061078069e-06, + "loss": 0.0015, + "num_tokens": 10099639.0, + "reward": 13.879165649414062, + "reward_std": 0.37595221400260925, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8210693597793579, + "rewards/length2tails_reward/std": 0.2428348809480667, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09583273157477379, + "epoch": 2.32, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11117927730083466, + "learning_rate": 1.1481969047025993e-06, + "loss": -0.0009, + "num_tokens": 10108389.0, + "reward": 13.457784652709961, + "reward_std": 1.157409429550171, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.787238359451294, + "rewards/length2tails_reward/std": 0.26374247670173645, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3187546730041504, + "rewards/thermo_reward/std": 1.122248888015747, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.65625, + "completions/mean_terminated_length": 271.65625, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.08784770919010043, + "epoch": 2.322, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14624707400798798, + "learning_rate": 1.146928659624818e-06, + "loss": -0.0034, + "num_tokens": 10117114.0, + "reward": 13.683073997497559, + "reward_std": 0.8640279173851013, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7801396250724792, + "rewards/length2tails_reward/std": 0.26757025718688965, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09903733339160681, + "epoch": 2.324, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0889928787946701, + "learning_rate": 1.1456601729597735e-06, + "loss": -0.0051, + "num_tokens": 10125859.0, + "reward": 13.514886856079102, + "reward_std": 1.2132494449615479, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.768134593963623, + "rewards/length2tails_reward/std": 0.2764660120010376, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.377768039703369, + "rewards/thermo_reward/std": 1.0657364130020142, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.3125, + "completions/mean_terminated_length": 273.3125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0959858438000083, + "epoch": 2.326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13234223425388336, + "learning_rate": 1.1443914467931734e-06, + "loss": 0.0014, + "num_tokens": 10134637.0, + "reward": 13.181429862976074, + "reward_std": 2.5910279750823975, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.4817724227905273, + "rewards/kidney_reward/std": 0.8185869455337524, + "rewards/length2tails_reward/mean": 0.823712944984436, + "rewards/length2tails_reward/std": 0.24324999749660492, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2711193561553955, + "rewards/thermo_reward/std": 1.2079739570617676, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.0841765240766108, + "epoch": 2.328, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20495650172233582, + "learning_rate": 1.1431224832111194e-06, + "loss": 0.0053, + "num_tokens": 10143367.0, + "reward": 13.428060531616211, + "reward_std": 2.060119867324829, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5146913528442383, + "rewards/kidney_reward/std": 0.6323707103729248, + "rewards/length2tails_reward/mean": 0.7269923686981201, + "rewards/length2tails_reward/std": 0.30764666199684143, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4369945526123047, + "rewards/thermo_reward/std": 1.153154730796814, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.08938687853515148, + "epoch": 2.33, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16077359020709991, + "learning_rate": 1.141853284300103e-06, + "loss": 0.0022, + "num_tokens": 10152070.0, + "reward": 13.645215034484863, + "reward_std": 0.6276077628135681, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7494490742683411, + "rewards/length2tails_reward/std": 0.3037169277667999, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5099644660949707, + "rewards/thermo_reward/std": 0.5615194439888, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08419801201671362, + "epoch": 2.332, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3486080467700958, + "learning_rate": 1.1405838521470028e-06, + "loss": 0.0009, + "num_tokens": 10160812.0, + "reward": 13.514474868774414, + "reward_std": 1.1845166683197021, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7699288725852966, + "rewards/length2tails_reward/std": 0.27235648036003113, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4045357704162598, + "rewards/thermo_reward/std": 1.1235049962997437, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09492563083767891, + "epoch": 2.334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08199910819530487, + "learning_rate": 1.1393141888390813e-06, + "loss": -0.0037, + "num_tokens": 10169570.0, + "reward": 13.191213607788086, + "reward_std": 2.442258834838867, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.3957366943359375, + "rewards/kidney_reward/std": 1.0089517831802368, + "rewards/length2tails_reward/mean": 0.7714502811431885, + "rewards/length2tails_reward/std": 0.2730657458305359, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2571463584899902, + "rewards/thermo_reward/std": 1.5108851194381714, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.96875, + "completions/mean_terminated_length": 272.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08890333492308855, + "epoch": 2.336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12107077986001968, + "learning_rate": 1.1380442964639804e-06, + "loss": 0.0003, + "num_tokens": 10178337.0, + "reward": 12.145477294921875, + "reward_std": 4.592377185821533, + "rewards/fitness_reward/mean": 7.028352737426758, + "rewards/fitness_reward/std": 1.8827823400497437, + "rewards/kidney_reward/mean": 2.28318452835083, + "rewards/kidney_reward/std": 1.0548654794692993, + "rewards/length2tails_reward/mean": 0.7793346643447876, + "rewards/length2tails_reward/std": 0.3058479130268097, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.6560065746307373, + "rewards/thermo_reward/std": 2.263650417327881, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.10207834374159575, + "epoch": 2.338, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12703408300876617, + "learning_rate": 1.1367741771097197e-06, + "loss": 0.0024, + "num_tokens": 10187107.0, + "reward": 13.797447204589844, + "reward_std": 0.4676748812198639, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8015918135643005, + "rewards/length2tails_reward/std": 0.20477145910263062, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.09466278459876776, + "epoch": 2.34, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0807986930012703, + "learning_rate": 1.135503832864691e-06, + "loss": -0.0059, + "num_tokens": 10195835.0, + "reward": 12.499326705932617, + "reward_std": 3.7159574031829834, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.347208023071289, + "rewards/kidney_reward/std": 0.8026228547096252, + "rewards/length2tails_reward/mean": 0.7313401699066162, + "rewards/length2tails_reward/std": 0.3534490168094635, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9259305000305176, + "rewards/thermo_reward/std": 1.93488347530365, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09521954506635666, + "epoch": 2.342, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12933559715747833, + "learning_rate": 1.1342332658176555e-06, + "loss": -0.0027, + "num_tokens": 10204570.0, + "reward": 13.178197860717773, + "reward_std": 4.378511905670166, + "rewards/fitness_reward/mean": 7.008818626403809, + "rewards/fitness_reward/std": 1.993286371231079, + "rewards/kidney_reward/mean": 2.441524028778076, + "rewards/kidney_reward/std": 1.0462663173675537, + "rewards/length2tails_reward/mean": 0.7510663270950317, + "rewards/length2tails_reward/std": 0.26452168822288513, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.552748441696167, + "rewards/thermo_reward/std": 1.3490850925445557, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 273.3125, + "completions/mean_terminated_length": 273.3125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08618407044559717, + "epoch": 2.344, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06609036028385162, + "learning_rate": 1.1329624780577425e-06, + "loss": 0.0004, + "num_tokens": 10213348.0, + "reward": 13.34747314453125, + "reward_std": 1.7441539764404297, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5383336544036865, + "rewards/kidney_reward/std": 0.49862852692604065, + "rewards/length2tails_reward/mean": 0.8297273516654968, + "rewards/length2tails_reward/std": 0.24132372438907623, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.264981746673584, + "rewards/thermo_reward/std": 1.4160000085830688, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09455079026520252, + "epoch": 2.346, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11646193265914917, + "learning_rate": 1.1316914716744426e-06, + "loss": -0.006, + "num_tokens": 10222075.0, + "reward": 13.316307067871094, + "reward_std": 1.4536793231964111, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7493178844451904, + "rewards/length2tails_reward/std": 0.2923212945461273, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1537110805511475, + "rewards/thermo_reward/std": 1.4431830644607544, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.40625, + "completions/mean_terminated_length": 272.40625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08919279649853706, + "epoch": 2.348, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1078300029039383, + "learning_rate": 1.1304202487576066e-06, + "loss": 0.0005, + "num_tokens": 10230824.0, + "reward": 13.742897033691406, + "reward_std": 0.6213005781173706, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8032835125923157, + "rewards/length2tails_reward/std": 0.23592759668827057, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.102267200127244, + "epoch": 2.35, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06344044208526611, + "learning_rate": 1.1291488113974415e-06, + "loss": -0.0041, + "num_tokens": 10239574.0, + "reward": 13.540192604064941, + "reward_std": 1.4909051656723022, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7741001844406128, + "rewards/length2tails_reward/std": 0.24528786540031433, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4024767875671387, + "rewards/thermo_reward/std": 1.3338748216629028, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09431599359959364, + "epoch": 2.352, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09179411083459854, + "learning_rate": 1.127877161684506e-06, + "loss": -0.0008, + "num_tokens": 10248309.0, + "reward": 13.469427108764648, + "reward_std": 1.0344244241714478, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7367984056472778, + "rewards/length2tails_reward/std": 0.27189457416534424, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.335442543029785, + "rewards/thermo_reward/std": 0.9056594371795654, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08454183116555214, + "epoch": 2.354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10728532820940018, + "learning_rate": 1.126605301709709e-06, + "loss": -0.0042, + "num_tokens": 10257072.0, + "reward": 13.568931579589844, + "reward_std": 1.9710923433303833, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5000758171081543, + "rewards/kidney_reward/std": 0.7150477766990662, + "rewards/length2tails_reward/mean": 0.7729551792144775, + "rewards/length2tails_reward/std": 0.25684475898742676, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.530374526977539, + "rewards/thermo_reward/std": 1.26329505443573, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09735297691076994, + "epoch": 2.356, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1453525424003601, + "learning_rate": 1.1253332335643042e-06, + "loss": 0.0013, + "num_tokens": 10265835.0, + "reward": 13.430132865905762, + "reward_std": 1.3856492042541504, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7720500230789185, + "rewards/length2tails_reward/std": 0.25347065925598145, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3473410606384277, + "rewards/thermo_reward/std": 1.158356785774231, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09220325388014317, + "epoch": 2.358, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1444585621356964, + "learning_rate": 1.1240609593398884e-06, + "loss": -0.0003, + "num_tokens": 10274570.0, + "reward": 12.636201858520508, + "reward_std": 5.048321723937988, + "rewards/fitness_reward/mean": 7.012203216552734, + "rewards/fitness_reward/std": 1.9741390943527222, + "rewards/kidney_reward/mean": 2.2174811363220215, + "rewards/kidney_reward/std": 1.506496548652649, + "rewards/length2tails_reward/mean": 0.7474431395530701, + "rewards/length2tails_reward/std": 0.31676745414733887, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2317728996276855, + "rewards/thermo_reward/std": 1.8967760801315308, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0881521599367261, + "epoch": 2.36, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12496333569288254, + "learning_rate": 1.122788481128397e-06, + "loss": -0.0008, + "num_tokens": 10283327.0, + "reward": 13.534195899963379, + "reward_std": 1.1190105676651, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.784372866153717, + "rewards/length2tails_reward/std": 0.25296303629875183, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.368093967437744, + "rewards/thermo_reward/std": 1.1134434938430786, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.40625, + "completions/mean_terminated_length": 271.40625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.08923059329390526, + "epoch": 2.362, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.061542972922325134, + "learning_rate": 1.1215158010221004e-06, + "loss": -0.0036, + "num_tokens": 10292044.0, + "reward": 13.394866943359375, + "reward_std": 2.238596200942993, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5037803649902344, + "rewards/kidney_reward/std": 0.6940914392471313, + "rewards/length2tails_reward/mean": 0.6941548585891724, + "rewards/length2tails_reward/std": 0.35992923378944397, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4179959297180176, + "rewards/thermo_reward/std": 1.2520838975906372, + "step": 1181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 287.71875, + "completions/mean_terminated_length": 272.6773986816406, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1059657009318471, + "epoch": 2.364, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5759685635566711, + "learning_rate": 1.1202429211136011e-06, + "loss": -0.0212, + "num_tokens": 10301283.0, + "reward": 13.834796905517578, + "reward_std": 0.4251575767993927, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7762397527694702, + "rewards/length2tails_reward/std": 0.2769671082496643, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09039780683815479, + "epoch": 2.366, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1149810180068016, + "learning_rate": 1.1189698434958308e-06, + "loss": 0.0005, + "num_tokens": 10310027.0, + "reward": 13.576425552368164, + "reward_std": 1.3609395027160645, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7675727605819702, + "rewards/length2tails_reward/std": 0.2958059012889862, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4393632411956787, + "rewards/thermo_reward/std": 1.2457044124603271, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.5625, + "completions/mean_terminated_length": 273.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08911374118179083, + "epoch": 2.368, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10090884566307068, + "learning_rate": 1.1176965702620453e-06, + "loss": 0.0011, + "num_tokens": 10318813.0, + "reward": 13.879654884338379, + "reward_std": 0.37785404920578003, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8259574174880981, + "rewards/length2tails_reward/std": 0.26726025342941284, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09442420396953821, + "epoch": 2.37, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09971757233142853, + "learning_rate": 1.1164231035058227e-06, + "loss": -0.0011, + "num_tokens": 10327555.0, + "reward": 13.422538757324219, + "reward_std": 1.3540540933609009, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7836484909057617, + "rewards/length2tails_reward/std": 0.2161341905593872, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2838685512542725, + "rewards/thermo_reward/std": 1.246479868888855, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10214246436953545, + "epoch": 2.372, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1652139127254486, + "learning_rate": 1.1151494453210594e-06, + "loss": -0.0042, + "num_tokens": 10336321.0, + "reward": 13.216739654541016, + "reward_std": 2.033397912979126, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.510984420776367, + "rewards/kidney_reward/std": 0.5172518491744995, + "rewards/length2tails_reward/mean": 0.8202112913131714, + "rewards/length2tails_reward/std": 0.19941861927509308, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1625497341156006, + "rewards/thermo_reward/std": 1.6370229721069336, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10318686440587044, + "epoch": 2.374, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12082388252019882, + "learning_rate": 1.1138755978019656e-06, + "loss": 0.0042, + "num_tokens": 10345072.0, + "reward": 13.59936237335205, + "reward_std": 1.057921051979065, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7763901948928833, + "rewards/length2tails_reward/std": 0.2672812044620514, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4614181518554688, + "rewards/thermo_reward/std": 1.0285004377365112, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09636431373655796, + "epoch": 2.376, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06303004920482635, + "learning_rate": 1.1126015630430642e-06, + "loss": -0.0074, + "num_tokens": 10353811.0, + "reward": 13.065311431884766, + "reward_std": 3.1626193523406982, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.511730670928955, + "rewards/kidney_reward/std": 0.5132253766059875, + "rewards/length2tails_reward/mean": 0.7315313220024109, + "rewards/length2tails_reward/std": 0.31727486848831177, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.32737398147583, + "rewards/thermo_reward/std": 1.237959861755371, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 266.375, + "completions/mean_terminated_length": 266.375, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 0.08927316032350063, + "epoch": 2.378, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4410877227783203, + "learning_rate": 1.1113273431391847e-06, + "loss": -0.1267, + "num_tokens": 10362367.0, + "reward": 12.523245811462402, + "reward_std": 5.3058390617370605, + "rewards/fitness_reward/mean": 6.6293439865112305, + "rewards/fitness_reward/std": 2.8821799755096436, + "rewards/kidney_reward/mean": 2.3808608055114746, + "rewards/kidney_reward/std": 1.239362120628357, + "rewards/length2tails_reward/mean": 0.8259913325309753, + "rewards/length2tails_reward/std": 0.2122562676668167, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.330442428588867, + "rewards/thermo_reward/std": 1.5098158121109009, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09383781487122178, + "epoch": 2.38, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08897305279970169, + "learning_rate": 1.1100529401854616e-06, + "loss": -0.0058, + "num_tokens": 10371112.0, + "reward": 13.609582901000977, + "reward_std": 0.6566846966743469, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7919836640357971, + "rewards/length2tails_reward/std": 0.25463926792144775, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.470078706741333, + "rewards/thermo_reward/std": 0.5830413699150085, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09251491725444794, + "epoch": 2.382, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1091945618391037, + "learning_rate": 1.108778356277331e-06, + "loss": 0.0003, + "num_tokens": 10379866.0, + "reward": 13.770999908447266, + "reward_std": 0.55600905418396, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8107157945632935, + "rewards/length2tails_reward/std": 0.24261504411697388, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08884621225297451, + "epoch": 2.384, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08674254268407822, + "learning_rate": 1.107503593510525e-06, + "loss": -0.0052, + "num_tokens": 10388587.0, + "reward": 12.903682708740234, + "reward_std": 4.312568187713623, + "rewards/fitness_reward/mean": 7.004302978515625, + "rewards/fitness_reward/std": 2.018831729888916, + "rewards/kidney_reward/mean": 2.4614100456237793, + "rewards/kidney_reward/std": 0.9337738752365112, + "rewards/length2tails_reward/mean": 0.6988670825958252, + "rewards/length2tails_reward/std": 0.32447734475135803, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2680823802948, + "rewards/thermo_reward/std": 1.4241454601287842, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.08987856283783913, + "epoch": 2.386, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07334306091070175, + "learning_rate": 1.106228653981071e-06, + "loss": -0.0067, + "num_tokens": 10397318.0, + "reward": 13.505498886108398, + "reward_std": 0.9637717008590698, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7330926656723022, + "rewards/length2tails_reward/std": 0.3236958384513855, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.371884346008301, + "rewards/thermo_reward/std": 0.9134882688522339, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.34375, + "completions/mean_terminated_length": 273.34375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09814102575182915, + "epoch": 2.388, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11979229003190994, + "learning_rate": 1.1049535397852871e-06, + "loss": 0.0004, + "num_tokens": 10406097.0, + "reward": 13.654312133789062, + "reward_std": 0.6249982714653015, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8404203653335571, + "rewards/length2tails_reward/std": 0.22256071865558624, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.509964942932129, + "rewards/thermo_reward/std": 0.5615193843841553, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0880571873858571, + "epoch": 2.39, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0868200808763504, + "learning_rate": 1.1036782530197775e-06, + "loss": 0.0036, + "num_tokens": 10414843.0, + "reward": 13.65894889831543, + "reward_std": 0.6650402545928955, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7615159749984741, + "rewards/length2tails_reward/std": 0.25980833172798157, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498504638671875, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.625, + "completions/mean_terminated_length": 273.625, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.09931143745779991, + "epoch": 2.392, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06371911615133286, + "learning_rate": 1.1024027957814312e-06, + "loss": -0.0045, + "num_tokens": 10423631.0, + "reward": 13.089361190795898, + "reward_std": 3.0056564807891846, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.511730670928955, + "rewards/kidney_reward/std": 0.5132253766059875, + "rewards/length2tails_reward/mean": 0.8789088726043701, + "rewards/length2tails_reward/std": 0.20397359132766724, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.336686372756958, + "rewards/thermo_reward/std": 0.9002619385719299, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.53125, + "completions/mean_terminated_length": 272.53125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09386175684630871, + "epoch": 2.394, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09913461655378342, + "learning_rate": 1.1011271701674176e-06, + "loss": -0.0059, + "num_tokens": 10432384.0, + "reward": 13.630131721496582, + "reward_std": 0.9156928062438965, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7360557317733765, + "rewards/length2tails_reward/std": 0.3140716552734375, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4962210655212402, + "rewards/thermo_reward/std": 0.8580461740493774, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09354142751544714, + "epoch": 2.396, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10588572919368744, + "learning_rate": 1.099851378275182e-06, + "loss": 0.005, + "num_tokens": 10441149.0, + "reward": 13.799158096313477, + "reward_std": 0.466749906539917, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8187099695205688, + "rewards/length2tails_reward/std": 0.1727634072303772, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08997735194861889, + "epoch": 2.398, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08149711787700653, + "learning_rate": 1.0985754222024436e-06, + "loss": -0.0021, + "num_tokens": 10449857.0, + "reward": 13.479462623596191, + "reward_std": 1.150247573852539, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.6816555261611938, + "rewards/length2tails_reward/std": 0.3275473713874817, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.323631763458252, + "rewards/thermo_reward/std": 1.1417499780654907, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.09375, + "completions/mean_terminated_length": 271.09375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.09271670412272215, + "epoch": 2.4, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0913810133934021, + "learning_rate": 1.0972993040471917e-06, + "loss": -0.0008, + "num_tokens": 10458564.0, + "reward": 13.827293395996094, + "reward_std": 0.4297630786895752, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7012042999267578, + "rewards/length2tails_reward/std": 0.305792897939682, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08737404271960258, + "epoch": 2.402, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09364177286624908, + "learning_rate": 1.0960230259076817e-06, + "loss": 0.0026, + "num_tokens": 10467301.0, + "reward": 13.834736824035645, + "reward_std": 0.42323192954063416, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7756407260894775, + "rewards/length2tails_reward/std": 0.25788721442222595, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "entropy": 0.09901608899235725, + "epoch": 2.404, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08422534912824631, + "learning_rate": 1.0947465898824328e-06, + "loss": -0.0038, + "num_tokens": 10476012.0, + "reward": 13.394966125488281, + "reward_std": 2.5611391067504883, + "rewards/fitness_reward/mean": 6.979666709899902, + "rewards/fitness_reward/std": 2.158194065093994, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8655651211738586, + "rewards/length2tails_reward/std": 0.21018658578395844, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09906727354973555, + "epoch": 2.406, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06459254771471024, + "learning_rate": 1.093469998070223e-06, + "loss": -0.0053, + "num_tokens": 10484777.0, + "reward": 13.304363250732422, + "reward_std": 3.006488084793091, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.5390896797180176, + "rewards/kidney_reward/std": 0.49435171484947205, + "rewards/length2tails_reward/mean": 0.7610428333282471, + "rewards/length2tails_reward/std": 0.3280724883079529, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.536116123199463, + "rewards/thermo_reward/std": 0.8422486186027527, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09012809442356229, + "epoch": 2.408, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16831642389297485, + "learning_rate": 1.0921932525700868e-06, + "loss": 0.0009, + "num_tokens": 10493513.0, + "reward": 13.215476036071777, + "reward_std": 1.5913658142089844, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7565579414367676, + "rewards/length2tails_reward/std": 0.25465884804725647, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1068739891052246, + "rewards/thermo_reward/std": 1.4153202772140503, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.78125, + "completions/mean_terminated_length": 273.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08467008266597986, + "epoch": 2.41, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08127164095640182, + "learning_rate": 1.0909163554813116e-06, + "loss": 0.001, + "num_tokens": 10502306.0, + "reward": 13.799966812133789, + "reward_std": 0.47498491406440735, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8267866969108582, + "rewards/length2tails_reward/std": 0.26131075620651245, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09263253770768642, + "epoch": 2.412, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09587337076663971, + "learning_rate": 1.0896393089034335e-06, + "loss": -0.001, + "num_tokens": 10511043.0, + "reward": 13.44931411743164, + "reward_std": 1.5483500957489014, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7766602039337158, + "rewards/length2tails_reward/std": 0.2561853229999542, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3113417625427246, + "rewards/thermo_reward/std": 1.403664469718933, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08940061461180449, + "epoch": 2.414, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08128293603658676, + "learning_rate": 1.088362114936235e-06, + "loss": -0.0026, + "num_tokens": 10519778.0, + "reward": 13.449129104614258, + "reward_std": 1.4584228992462158, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7604248523712158, + "rewards/length2tails_reward/std": 0.30569180846214294, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.367499589920044, + "rewards/thermo_reward/std": 1.2244524955749512, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09399153850972652, + "epoch": 2.416, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14469492435455322, + "learning_rate": 1.0870847756797404e-06, + "loss": -0.0013, + "num_tokens": 10528542.0, + "reward": 13.58527660369873, + "reward_std": 1.3392078876495361, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7924588322639465, + "rewards/length2tails_reward/std": 0.2551293969154358, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4730844497680664, + "rewards/thermo_reward/std": 1.1635582447052002, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08919752668589354, + "epoch": 2.418, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08864497393369675, + "learning_rate": 1.0858072932342132e-06, + "loss": -0.0005, + "num_tokens": 10537296.0, + "reward": 12.8340425491333, + "reward_std": 4.473905563354492, + "rewards/fitness_reward/mean": 7.023777484893799, + "rewards/fitness_reward/std": 1.9086664915084839, + "rewards/kidney_reward/mean": 2.4164366722106934, + "rewards/kidney_reward/std": 1.0399911403656006, + "rewards/length2tails_reward/mean": 0.7797451019287109, + "rewards/length2tails_reward/std": 0.2637164294719696, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2158541679382324, + "rewards/thermo_reward/std": 1.8010083436965942, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 275.09375, + "completions/mean_terminated_length": 275.09375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08536658063530922, + "epoch": 2.42, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10550517588853836, + "learning_rate": 1.0845296697001527e-06, + "loss": 0.0029, + "num_tokens": 10546131.0, + "reward": 13.507648468017578, + "reward_std": 1.3550305366516113, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8864538073539734, + "rewards/length2tails_reward/std": 0.2049570232629776, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.386056900024414, + "rewards/thermo_reward/std": 1.1557670831680298, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.71875, + "completions/mean_terminated_length": 272.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0972055597230792, + "epoch": 2.422, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09713931381702423, + "learning_rate": 1.0832519071782892e-06, + "loss": 0.0013, + "num_tokens": 10554890.0, + "reward": 13.635648727416992, + "reward_std": 0.583570122718811, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7790554761886597, + "rewards/length2tails_reward/std": 0.3008067309856415, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.470078706741333, + "rewards/thermo_reward/std": 0.5830413699150085, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08462123945355415, + "epoch": 2.424, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07095858454704285, + "learning_rate": 1.0819740077695824e-06, + "loss": -0.0026, + "num_tokens": 10563632.0, + "reward": 13.505558013916016, + "reward_std": 1.648931860923767, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7744401693344116, + "rewards/length2tails_reward/std": 0.3092653453350067, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.395167350769043, + "rewards/thermo_reward/std": 1.4293804168701172, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.53125, + "completions/mean_terminated_length": 272.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09231602028012276, + "epoch": 2.426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11986473947763443, + "learning_rate": 1.0806959735752173e-06, + "loss": 0.0009, + "num_tokens": 10572385.0, + "reward": 13.415735244750977, + "reward_std": 1.188355565071106, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.746727705001831, + "rewards/length2tails_reward/std": 0.320938378572464, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2807564735412598, + "rewards/thermo_reward/std": 1.1604331731796265, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0945893507450819, + "epoch": 2.428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12068387120962143, + "learning_rate": 1.0794178066965993e-06, + "loss": -0.0038, + "num_tokens": 10581094.0, + "reward": 13.645952224731445, + "reward_std": 1.5051319599151611, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.531928062438965, + "rewards/kidney_reward/std": 0.5348639488220215, + "rewards/length2tails_reward/mean": 0.6977065801620483, + "rewards/length2tails_reward/std": 0.31715697050094604, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.583068609237671, + "rewards/thermo_reward/std": 0.9773446917533875, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.59375, + "completions/mean_terminated_length": 273.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08710552845150232, + "epoch": 2.43, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11045100539922714, + "learning_rate": 1.0781395092353525e-06, + "loss": -0.0033, + "num_tokens": 10589881.0, + "reward": 13.563175201416016, + "reward_std": 1.3086121082305908, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8308136463165283, + "rewards/length2tails_reward/std": 0.2591162621974945, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.477297306060791, + "rewards/thermo_reward/std": 1.1414297819137573, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09739651903510094, + "epoch": 2.432, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1206652969121933, + "learning_rate": 1.0768610832933167e-06, + "loss": 0.0002, + "num_tokens": 10598639.0, + "reward": 13.21466064453125, + "reward_std": 2.480855941772461, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4809939861297607, + "rewards/kidney_reward/std": 0.8229905962944031, + "rewards/length2tails_reward/mean": 0.7944443225860596, + "rewards/length2tails_reward/std": 0.25818219780921936, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.25054669380188, + "rewards/thermo_reward/std": 1.5193787813186646, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.90625, + "completions/mean_terminated_length": 273.90625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.08703937754034996, + "epoch": 2.434, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1481795758008957, + "learning_rate": 1.0755825309725415e-06, + "loss": 0.0023, + "num_tokens": 10607436.0, + "reward": 13.805276870727539, + "reward_std": 0.468355268239975, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8799004554748535, + "rewards/length2tails_reward/std": 0.12640152871608734, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.40625, + "completions/mean_terminated_length": 272.40625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09083101619035006, + "epoch": 2.436, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1297113448381424, + "learning_rate": 1.0743038543752852e-06, + "loss": 0.001, + "num_tokens": 10616185.0, + "reward": 13.531122207641602, + "reward_std": 1.9644542932510376, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5387279987335205, + "rewards/kidney_reward/std": 0.49639827013015747, + "rewards/length2tails_reward/mean": 0.7927061319351196, + "rewards/length2tails_reward/std": 0.2188922017812729, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5094475746154785, + "rewards/thermo_reward/std": 1.1716644763946533, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08744846004992723, + "epoch": 2.438, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05081085115671158, + "learning_rate": 1.07302505560401e-06, + "loss": -0.0053, + "num_tokens": 10624913.0, + "reward": 13.099504470825195, + "reward_std": 3.1085195541381836, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.5390896797180176, + "rewards/kidney_reward/std": 0.49435171484947205, + "rewards/length2tails_reward/mean": 0.757485568523407, + "rewards/length2tails_reward/std": 0.2631629407405853, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3316128253936768, + "rewards/thermo_reward/std": 1.2207468748092651, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09191823564469814, + "epoch": 2.44, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13226445019245148, + "learning_rate": 1.0717461367613792e-06, + "loss": -0.0005, + "num_tokens": 10633664.0, + "reward": 13.734261512756348, + "reward_std": 0.8752428889274597, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8042384386062622, + "rewards/length2tails_reward/std": 0.23499715328216553, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5661725997924805, + "rewards/thermo_reward/std": 0.8740096092224121, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.15625, + "completions/mean_terminated_length": 273.15625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09383957739919424, + "epoch": 2.442, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11894011497497559, + "learning_rate": 1.0704670999502538e-06, + "loss": 0.0035, + "num_tokens": 10642437.0, + "reward": 13.813368797302246, + "reward_std": 0.5172167420387268, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8355466723442078, + "rewards/length2tails_reward/std": 0.21538470685482025, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09643411543220282, + "epoch": 2.444, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1086791530251503, + "learning_rate": 1.0691879472736883e-06, + "loss": 0.0005, + "num_tokens": 10651194.0, + "reward": 13.303426742553711, + "reward_std": 1.154916524887085, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7456244826316833, + "rewards/length2tails_reward/std": 0.30064287781715393, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.195917844772339, + "rewards/thermo_reward/std": 1.0026649236679077, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.28125, + "completions/mean_terminated_length": 273.28125, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.09604166820645332, + "epoch": 2.446, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14771340787410736, + "learning_rate": 1.0679086808349277e-06, + "loss": -0.0001, + "num_tokens": 10659971.0, + "reward": 13.22848892211914, + "reward_std": 2.40313458442688, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4679837226867676, + "rewards/kidney_reward/std": 0.7528902888298035, + "rewards/length2tails_reward/mean": 0.847961962223053, + "rewards/length2tails_reward/std": 0.18185418844223022, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2720327377319336, + "rewards/thermo_reward/std": 1.4439270496368408, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 268.25, + "completions/mean_terminated_length": 268.25, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.09932470787316561, + "epoch": 2.448, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3754037618637085, + "learning_rate": 1.0666293027374043e-06, + "loss": -0.0708, + "num_tokens": 10668587.0, + "reward": 12.5902099609375, + "reward_std": 5.2144060134887695, + "rewards/fitness_reward/mean": 6.969270706176758, + "rewards/fitness_reward/std": 2.2170021533966064, + "rewards/kidney_reward/mean": 2.2978575229644775, + "rewards/kidney_reward/std": 1.3816429376602173, + "rewards/length2tails_reward/mean": 0.7470918893814087, + "rewards/length2tails_reward/std": 0.27570077776908875, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1483712196350098, + "rewards/thermo_reward/std": 1.9086298942565918, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.46875, + "completions/mean_terminated_length": 273.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0799891110509634, + "epoch": 2.45, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13458140194416046, + "learning_rate": 1.0653498150847342e-06, + "loss": -0.0015, + "num_tokens": 10677370.0, + "reward": 13.183582305908203, + "reward_std": 2.0887210369110107, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5188488960266113, + "rewards/kidney_reward/std": 0.6088516116142273, + "rewards/length2tails_reward/mean": 0.8200899958610535, + "rewards/length2tails_reward/std": 0.260699599981308, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.12153959274292, + "rewards/thermo_reward/std": 1.5782240629196167, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08691063802689314, + "epoch": 2.452, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1246851459145546, + "learning_rate": 1.064070219980713e-06, + "loss": -0.0003, + "num_tokens": 10686128.0, + "reward": 13.246986389160156, + "reward_std": 1.9061574935913086, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4933767318725586, + "rewards/kidney_reward/std": 0.6130226850509644, + "rewards/length2tails_reward/mean": 0.7921514511108398, + "rewards/length2tails_reward/std": 0.25093090534210205, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.213209867477417, + "rewards/thermo_reward/std": 1.4650269746780396, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08808344416320324, + "epoch": 2.454, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0639815554022789, + "learning_rate": 1.0627905195293135e-06, + "loss": -0.0061, + "num_tokens": 10694889.0, + "reward": 12.932845115661621, + "reward_std": 3.411823272705078, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.4229860305786133, + "rewards/kidney_reward/std": 0.7031325697898865, + "rewards/length2tails_reward/mean": 0.786348819732666, + "rewards/length2tails_reward/std": 0.23730994760990143, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.278170585632324, + "rewards/thermo_reward/std": 1.4518730640411377, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0898923072963953, + "epoch": 2.456, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0836012214422226, + "learning_rate": 1.0615107158346814e-06, + "loss": -0.0034, + "num_tokens": 10703654.0, + "reward": 13.64627742767334, + "reward_std": 0.9476872086524963, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7877658009529114, + "rewards/length2tails_reward/std": 0.27643802762031555, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4798367023468018, + "rewards/thermo_reward/std": 0.9370023608207703, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.09534856677055359, + "epoch": 2.458, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14600451290607452, + "learning_rate": 1.0602308110011326e-06, + "loss": 0.0038, + "num_tokens": 10712377.0, + "reward": 13.80593490600586, + "reward_std": 0.516539990901947, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7612037658691406, + "rewards/length2tails_reward/std": 0.3153897821903229, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10472780279815197, + "epoch": 2.46, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28879615664482117, + "learning_rate": 1.0589508071331486e-06, + "loss": 0.0049, + "num_tokens": 10721113.0, + "reward": 12.659233093261719, + "reward_std": 4.623455047607422, + "rewards/fitness_reward/mean": 6.967202663421631, + "rewards/fitness_reward/std": 1.920608401298523, + "rewards/kidney_reward/mean": 2.3063979148864746, + "rewards/kidney_reward/std": 1.1783033609390259, + "rewards/length2tails_reward/mean": 0.778078019618988, + "rewards/length2tails_reward/std": 0.24348245561122894, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.20782470703125, + "rewards/thermo_reward/std": 1.7583760023117065, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.96875, + "completions/mean_terminated_length": 272.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09287342242896557, + "epoch": 2.462, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17823663353919983, + "learning_rate": 1.0576707063353745e-06, + "loss": 0.0003, + "num_tokens": 10729880.0, + "reward": 12.828574180603027, + "reward_std": 5.078460693359375, + "rewards/fitness_reward/mean": 6.965703964233398, + "rewards/fitness_reward/std": 2.23717999458313, + "rewards/kidney_reward/mean": 2.368088722229004, + "rewards/kidney_reward/std": 1.3110778331756592, + "rewards/length2tails_reward/mean": 0.7966312170028687, + "rewards/length2tails_reward/std": 0.2715126872062683, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3151190280914307, + "rewards/thermo_reward/std": 1.5917843580245972, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08647519163787365, + "epoch": 2.464, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06193329393863678, + "learning_rate": 1.0563905107126144e-06, + "loss": -0.0067, + "num_tokens": 10738644.0, + "reward": 13.224793434143066, + "reward_std": 2.5808629989624023, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4222705364227295, + "rewards/kidney_reward/std": 0.8070963025093079, + "rewards/length2tails_reward/mean": 0.7824023962020874, + "rewards/length2tails_reward/std": 0.2957296669483185, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2630972862243652, + "rewards/thermo_reward/std": 1.7700457572937012, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.10227520670741796, + "epoch": 2.466, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17705994844436646, + "learning_rate": 1.055110222369828e-06, + "loss": 0.0074, + "num_tokens": 10747385.0, + "reward": 13.770784378051758, + "reward_std": 0.546610414981842, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8085607290267944, + "rewards/length2tails_reward/std": 0.23666897416114807, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.59375, + "completions/mean_terminated_length": 273.59375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.10402582865208387, + "epoch": 2.468, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22149960696697235, + "learning_rate": 1.0538298434121282e-06, + "loss": -0.0019, + "num_tokens": 10756172.0, + "reward": 13.709596633911133, + "reward_std": 0.6485846042633057, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8691390752792358, + "rewards/length2tails_reward/std": 0.17218910157680511, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09120804257690907, + "epoch": 2.4699999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05785958468914032, + "learning_rate": 1.0525493759447763e-06, + "loss": -0.0047, + "num_tokens": 10764935.0, + "reward": 13.47886848449707, + "reward_std": 1.4138140678405762, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5382261276245117, + "rewards/kidney_reward/std": 0.49923640489578247, + "rewards/length2tails_reward/mean": 0.794089674949646, + "rewards/length2tails_reward/std": 0.2710154056549072, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.400047779083252, + "rewards/thermo_reward/std": 0.9587206840515137, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08946628589183092, + "epoch": 2.472, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10416407883167267, + "learning_rate": 1.0512688220731792e-06, + "loss": 0.0017, + "num_tokens": 10773701.0, + "reward": 13.635832786560059, + "reward_std": 1.0921016931533813, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8250085115432739, + "rewards/length2tails_reward/std": 0.2135208547115326, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.520385980606079, + "rewards/thermo_reward/std": 0.9200055599212646, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5, + "completions/mean_terminated_length": 272.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08608097489923239, + "epoch": 2.474, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08293689042329788, + "learning_rate": 1.0499881839028866e-06, + "loss": -0.008, + "num_tokens": 10782453.0, + "reward": 13.533278465270996, + "reward_std": 1.0921880006790161, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7436509132385254, + "rewards/length2tails_reward/std": 0.3194071352481842, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4561173915863037, + "rewards/thermo_reward/std": 0.8726389408111572, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09384459909051657, + "epoch": 2.476, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11530506610870361, + "learning_rate": 1.0487074635395853e-06, + "loss": 0.0013, + "num_tokens": 10791180.0, + "reward": 13.605308532714844, + "reward_std": 0.6402997374534607, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7492390871047974, + "rewards/length2tails_reward/std": 0.2651563882827759, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.470078706741333, + "rewards/thermo_reward/std": 0.5830413699150085, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.09741722140461206, + "epoch": 2.4779999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1013156846165657, + "learning_rate": 1.0474266630890985e-06, + "loss": 0.0001, + "num_tokens": 10799938.0, + "reward": 13.702683448791504, + "reward_std": 1.0358788967132568, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8046396970748901, + "rewards/length2tails_reward/std": 0.2169589251279831, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5619144439697266, + "rewards/thermo_reward/std": 0.8957966566085815, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.65625, + "completions/mean_terminated_length": 271.65625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09494194388389587, + "epoch": 2.48, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16340993344783783, + "learning_rate": 1.0461457846573809e-06, + "loss": -0.0004, + "num_tokens": 10808663.0, + "reward": 13.323081970214844, + "reward_std": 1.4894757270812988, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7303534746170044, + "rewards/length2tails_reward/std": 0.2566344141960144, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1897404193878174, + "rewards/thermo_reward/std": 1.404547095298767, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.0, + "completions/mean_terminated_length": 273.0, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08389176521450281, + "epoch": 2.482, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12332534790039062, + "learning_rate": 1.044864830350515e-06, + "loss": 0.0017, + "num_tokens": 10817431.0, + "reward": 13.135833740234375, + "reward_std": 2.7119650840759277, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.46026611328125, + "rewards/kidney_reward/std": 0.7956667542457581, + "rewards/length2tails_reward/mean": 0.8329859375953674, + "rewards/length2tails_reward/std": 0.22733183205127716, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1885929107666016, + "rewards/thermo_reward/std": 1.7483952045440674, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 274.28125, + "completions/mean_terminated_length": 274.28125, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.09511914569884539, + "epoch": 2.484, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08770027756690979, + "learning_rate": 1.0435838022747084e-06, + "loss": -0.0017, + "num_tokens": 10826240.0, + "reward": 13.80496883392334, + "reward_std": 0.4740360379219055, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.87681645154953, + "rewards/length2tails_reward/std": 0.15799346566200256, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.08547612745314837, + "epoch": 2.4859999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1876983940601349, + "learning_rate": 1.04230270253629e-06, + "loss": 0.0002, + "num_tokens": 10834936.0, + "reward": 13.095527648925781, + "reward_std": 2.6350486278533936, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.363231658935547, + "rewards/kidney_reward/std": 0.9425392746925354, + "rewards/length2tails_reward/mean": 0.7321509122848511, + "rewards/length2tails_reward/std": 0.3210483491420746, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2554047107696533, + "rewards/thermo_reward/std": 1.5079888105392456, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08697150368243456, + "epoch": 2.488, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10575957596302032, + "learning_rate": 1.0410215332417065e-06, + "loss": -0.0019, + "num_tokens": 10843632.0, + "reward": 13.676462173461914, + "reward_std": 0.9143254160881042, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.6497900485992432, + "rewards/length2tails_reward/std": 0.34322085976600647, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5238184928894043, + "rewards/thermo_reward/std": 0.9028612971305847, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09583816025406122, + "epoch": 2.49, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08811348676681519, + "learning_rate": 1.0397402964975186e-06, + "loss": 0.0008, + "num_tokens": 10852392.0, + "reward": 13.367749214172363, + "reward_std": 2.2184150218963623, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4928197860717773, + "rewards/kidney_reward/std": 0.7560939788818359, + "rewards/length2tails_reward/mean": 0.8037996292114258, + "rewards/length2tails_reward/std": 0.2079600840806961, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.390873432159424, + "rewards/thermo_reward/std": 1.1930720806121826, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0990471001714468, + "epoch": 2.492, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06559862196445465, + "learning_rate": 1.0384589944103983e-06, + "loss": -0.0009, + "num_tokens": 10861135.0, + "reward": 13.631964683532715, + "reward_std": 1.0133236646652222, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7975431680679321, + "rewards/length2tails_reward/std": 0.2299695611000061, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4645447731018066, + "rewards/thermo_reward/std": 1.0127943754196167, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09418590925633907, + "epoch": 2.4939999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1989491581916809, + "learning_rate": 1.037177629087125e-06, + "loss": -0.0025, + "num_tokens": 10869907.0, + "reward": 12.421655654907227, + "reward_std": 5.143921852111816, + "rewards/fitness_reward/mean": 6.928044319152832, + "rewards/fitness_reward/std": 2.1392548084259033, + "rewards/kidney_reward/mean": 2.222644567489624, + "rewards/kidney_reward/std": 1.3976283073425293, + "rewards/length2tails_reward/mean": 0.8509255647659302, + "rewards/length2tails_reward/std": 0.16113992035388947, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.085874319076538, + "rewards/thermo_reward/std": 1.8372138738632202, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08448371384292841, + "epoch": 2.496, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12150036543607712, + "learning_rate": 1.0358962026345824e-06, + "loss": 0.0041, + "num_tokens": 10878657.0, + "reward": 13.807670593261719, + "reward_std": 0.5155578255653381, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7785651683807373, + "rewards/length2tails_reward/std": 0.22352470457553864, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "entropy": 0.0876282462850213, + "epoch": 2.498, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10053152590990067, + "learning_rate": 1.0346147171597535e-06, + "loss": 0.0021, + "num_tokens": 10887395.0, + "reward": 13.880105972290039, + "reward_std": 0.3773041367530823, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8304663896560669, + "rewards/length2tails_reward/std": 0.21791306138038635, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "entropy": 0.07947520073503256, + "epoch": 2.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15748527646064758, + "learning_rate": 1.0333331747697196e-06, + "loss": 0.0053, + "num_tokens": 10896131.0, + "reward": 12.376877784729004, + "reward_std": 5.173625469207764, + "rewards/fitness_reward/mean": 6.897719383239746, + "rewards/fitness_reward/std": 2.001847982406616, + "rewards/kidney_reward/mean": 2.1834561824798584, + "rewards/kidney_reward/std": 1.338996410369873, + "rewards/length2tails_reward/mean": 0.779033362865448, + "rewards/length2tails_reward/std": 0.30837929248809814, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1177992820739746, + "rewards/thermo_reward/std": 2.124000310897827, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.09179925918579102, + "epoch": 2.502, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1123429536819458, + "learning_rate": 1.0320515775716554e-06, + "loss": -0.0004, + "num_tokens": 10904892.0, + "reward": 13.492806434631348, + "reward_std": 1.2470368146896362, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.855421245098114, + "rewards/length2tails_reward/std": 0.122994564473629, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3743178844451904, + "rewards/thermo_reward/std": 1.082673192024231, + "step": 1251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0898106126114726, + "epoch": 2.504, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.130186066031456, + "learning_rate": 1.0307699276728248e-06, + "loss": -0.0052, + "num_tokens": 10913662.0, + "reward": 13.603492736816406, + "reward_std": 1.795384407043457, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.512298583984375, + "rewards/kidney_reward/std": 0.6459053158760071, + "rewards/length2tails_reward/mean": 0.7948645353317261, + "rewards/length2tails_reward/std": 0.2649412453174591, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.550522804260254, + "rewards/thermo_reward/std": 1.1532329320907593, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09738536924123764, + "epoch": 2.5060000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12294717133045197, + "learning_rate": 1.0294882271805798e-06, + "loss": 0.0041, + "num_tokens": 10922401.0, + "reward": 13.470455169677734, + "reward_std": 1.1227214336395264, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7700117826461792, + "rewards/length2tails_reward/std": 0.24589170515537262, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3605072498321533, + "rewards/thermo_reward/std": 0.9652195572853088, + "step": 1253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.53125, + "completions/mean_terminated_length": 273.53125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09221231192350388, + "epoch": 2.508, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10390887409448624, + "learning_rate": 1.0282064782023544e-06, + "loss": 0.0041, + "num_tokens": 10931186.0, + "reward": 13.456817626953125, + "reward_std": 2.2660326957702637, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4679338932037354, + "rewards/kidney_reward/std": 0.7531667947769165, + "rewards/length2tails_reward/mean": 0.8385056257247925, + "rewards/length2tails_reward/std": 0.21479752659797668, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.501357316970825, + "rewards/thermo_reward/std": 1.2150810956954956, + "step": 1254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 644.0, + "completions/max_terminated_length": 644.0, + "completions/mean_length": 282.875, + "completions/mean_terminated_length": 282.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0956075107678771, + "epoch": 2.51, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19084641337394714, + "learning_rate": 1.026924682845663e-06, + "loss": -0.0193, + "num_tokens": 10940270.0, + "reward": 13.905851364135742, + "reward_std": 0.32790374755859375, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.6890615224838257, + "rewards/length2tails_reward/std": 0.3036077320575714, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.71875, + "completions/mean_terminated_length": 272.71875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08849367219954729, + "epoch": 2.512, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09959346055984497, + "learning_rate": 1.0256428432180954e-06, + "loss": 0.0044, + "num_tokens": 10949029.0, + "reward": 13.916118621826172, + "reward_std": 0.3095528781414032, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7917271852493286, + "rewards/length2tails_reward/std": 0.2443895936012268, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09578220918774605, + "epoch": 2.5140000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06927625834941864, + "learning_rate": 1.0243609614273155e-06, + "loss": -0.0016, + "num_tokens": 10957759.0, + "reward": 13.563721656799316, + "reward_std": 1.1785898208618164, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.755905032157898, + "rewards/length2tails_reward/std": 0.2580097019672394, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.427825450897217, + "rewards/thermo_reward/std": 1.1418836116790771, + "step": 1257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08337361365556717, + "epoch": 2.516, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1192711740732193, + "learning_rate": 1.0230790395810554e-06, + "loss": -0.0049, + "num_tokens": 10966478.0, + "reward": 13.7611665725708, + "reward_std": 0.8724231123924255, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.6928871273994446, + "rewards/length2tails_reward/std": 0.27702251076698303, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.604213237762451, + "rewards/thermo_reward/std": 0.8649076819419861, + "step": 1258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0807227585464716, + "epoch": 2.518, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0811200961470604, + "learning_rate": 1.0217970797871138e-06, + "loss": -0.0054, + "num_tokens": 10975209.0, + "reward": 12.884984970092773, + "reward_std": 3.795445442199707, + "rewards/fitness_reward/mean": 6.971607208251953, + "rewards/fitness_reward/std": 1.8960589170455933, + "rewards/kidney_reward/mean": 2.390123128890991, + "rewards/kidney_reward/std": 0.8301148414611816, + "rewards/length2tails_reward/mean": 0.7773324847221375, + "rewards/length2tails_reward/std": 0.2632235586643219, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3455216884613037, + "rewards/thermo_reward/std": 1.3048064708709717, + "step": 1259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.1875, + "completions/mean_terminated_length": 273.1875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0955256512388587, + "epoch": 2.52, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10693230479955673, + "learning_rate": 1.0205150841533512e-06, + "loss": 0.0026, + "num_tokens": 10983983.0, + "reward": 13.840425491333008, + "reward_std": 0.42499473690986633, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8325204849243164, + "rewards/length2tails_reward/std": 0.2075044959783554, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08668213989585638, + "epoch": 2.5220000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06507860124111176, + "learning_rate": 1.019233054787687e-06, + "loss": -0.0003, + "num_tokens": 10992687.0, + "reward": 13.750646591186523, + "reward_std": 0.9315276145935059, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7100703120231628, + "rewards/length2tails_reward/std": 0.3212975859642029, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5919742584228516, + "rewards/thermo_reward/std": 0.9297705888748169, + "step": 1261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09384338092058897, + "epoch": 2.524, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07486650347709656, + "learning_rate": 1.0179509937980971e-06, + "loss": -0.0034, + "num_tokens": 11001433.0, + "reward": 13.481481552124023, + "reward_std": 1.5638045072555542, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7633057832717896, + "rewards/length2tails_reward/std": 0.27240240573883057, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3722052574157715, + "rewards/thermo_reward/std": 1.3520067930221558, + "step": 1262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0895926021039486, + "epoch": 2.526, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08937916904687881, + "learning_rate": 1.0166689032926083e-06, + "loss": -0.0036, + "num_tokens": 11010196.0, + "reward": 13.441423416137695, + "reward_std": 1.288163661956787, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7962145805358887, + "rewards/length2tails_reward/std": 0.2610096335411072, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.328855514526367, + "rewards/thermo_reward/std": 1.1160696744918823, + "step": 1263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.625, + "completions/mean_terminated_length": 273.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08504704385995865, + "epoch": 2.528, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1109505146741867, + "learning_rate": 1.0153867853792966e-06, + "loss": -0.0011, + "num_tokens": 11018984.0, + "reward": 13.530813217163086, + "reward_std": 1.5682084560394287, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5330631732940674, + "rewards/kidney_reward/std": 0.5284431576728821, + "rewards/length2tails_reward/mean": 0.8431212306022644, + "rewards/length2tails_reward/std": 0.22412942349910736, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4522523880004883, + "rewards/thermo_reward/std": 1.0748932361602783, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08961287420243025, + "epoch": 2.5300000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3546622693538666, + "learning_rate": 1.014104642166282e-06, + "loss": 0.0012, + "num_tokens": 11027725.0, + "reward": 12.969375610351562, + "reward_std": 4.262889862060547, + "rewards/fitness_reward/mean": 7.0060319900512695, + "rewards/fitness_reward/std": 2.0090479850769043, + "rewards/kidney_reward/mean": 2.4171500205993652, + "rewards/kidney_reward/std": 1.0360020399093628, + "rewards/length2tails_reward/mean": 0.7523579597473145, + "rewards/length2tails_reward/std": 0.25718116760253906, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.370958089828491, + "rewards/thermo_reward/std": 1.2960481643676758, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 274.59375, + "completions/mean_terminated_length": 274.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09319291729480028, + "epoch": 2.532, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06620564311742783, + "learning_rate": 1.0128224757617272e-06, + "loss": -0.0063, + "num_tokens": 11036544.0, + "reward": 12.781671524047852, + "reward_std": 3.350013256072998, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.457012176513672, + "rewards/kidney_reward/std": 0.5447914004325867, + "rewards/length2tails_reward/mean": 0.8345171213150024, + "rewards/length2tails_reward/std": 0.26822420954704285, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0881545543670654, + "rewards/thermo_reward/std": 1.6181961297988892, + "step": 1266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.625, + "completions/mean_terminated_length": 273.625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08962809201329947, + "epoch": 2.534, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11474078893661499, + "learning_rate": 1.0115402882738333e-06, + "loss": -0.0023, + "num_tokens": 11045332.0, + "reward": 13.674827575683594, + "reward_std": 0.8290287256240845, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8229432106018066, + "rewards/length2tails_reward/std": 0.2556893229484558, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.1875, + "completions/mean_terminated_length": 270.1875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "entropy": 0.09624328929930925, + "epoch": 2.536, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09320349991321564, + "learning_rate": 1.0102580818108345e-06, + "loss": 0.0033, + "num_tokens": 11054010.0, + "reward": 13.636301040649414, + "reward_std": 1.0296601057052612, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7002662420272827, + "rewards/length2tails_reward/std": 0.33832138776779175, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5059685707092285, + "rewards/thermo_reward/std": 0.9929302930831909, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 273.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09368395432829857, + "epoch": 2.5380000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1153569370508194, + "learning_rate": 1.0089758584809977e-06, + "loss": -0.0002, + "num_tokens": 11062790.0, + "reward": 13.67737102508545, + "reward_std": 0.5592531561851501, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7974194288253784, + "rewards/length2tails_reward/std": 0.2665257453918457, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5099644660949707, + "rewards/thermo_reward/std": 0.5615194439888, + "step": 1269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09078614320605993, + "epoch": 2.54, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10006042569875717, + "learning_rate": 1.0076936203926172e-06, + "loss": 0.0046, + "num_tokens": 11071560.0, + "reward": 13.719049453735352, + "reward_std": 0.5301154255867004, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8153437972068787, + "rewards/length2tails_reward/std": 0.2255038619041443, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498504638671875, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08808152377605438, + "epoch": 2.542, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09218467026948929, + "learning_rate": 1.0064113696540111e-06, + "loss": -0.0015, + "num_tokens": 11080297.0, + "reward": 13.561868667602539, + "reward_std": 1.4590805768966675, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7816716432571411, + "rewards/length2tails_reward/std": 0.22780533134937286, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.450754165649414, + "rewards/thermo_reward/std": 1.281867265701294, + "step": 1271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.6875, + "completions/mean_terminated_length": 273.6875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09965451620519161, + "epoch": 2.544, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08999935537576675, + "learning_rate": 1.0051291083735183e-06, + "loss": -0.0012, + "num_tokens": 11089087.0, + "reward": 13.493375778198242, + "reward_std": 0.9772318601608276, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.842260479927063, + "rewards/length2tails_reward/std": 0.22625687718391418, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3762035369873047, + "rewards/thermo_reward/std": 0.8942775130271912, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09344311617314816, + "epoch": 2.5460000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14794650673866272, + "learning_rate": 1.0038468386594957e-06, + "loss": 0.0025, + "num_tokens": 11097829.0, + "reward": 13.48156452178955, + "reward_std": 1.4521809816360474, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7624658346176147, + "rewards/length2tails_reward/std": 0.25733861327171326, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.345012664794922, + "rewards/thermo_reward/std": 1.326688289642334, + "step": 1273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "entropy": 0.09371494967490435, + "epoch": 2.548, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0943506732583046, + "learning_rate": 1.0025645626203135e-06, + "loss": 0.0035, + "num_tokens": 11106593.0, + "reward": 12.683286666870117, + "reward_std": 4.46260404586792, + "rewards/fitness_reward/mean": 6.967517852783203, + "rewards/fitness_reward/std": 1.9188534021377563, + "rewards/kidney_reward/mean": 2.366368055343628, + "rewards/kidney_reward/std": 1.0303393602371216, + "rewards/length2tails_reward/mean": 0.8446621894836426, + "rewards/length2tails_reward/std": 0.20369280874729156, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1649341583251953, + "rewards/thermo_reward/std": 1.7017467021942139, + "step": 1274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09274380747228861, + "epoch": 2.55, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0775628462433815, + "learning_rate": 1.0012822823643522e-06, + "loss": -0.0063, + "num_tokens": 11115349.0, + "reward": 13.795833587646484, + "reward_std": 0.4870997369289398, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7854651212692261, + "rewards/length2tails_reward/std": 0.28153514862060547, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 274.0625, + "completions/mean_terminated_length": 274.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09395697340369225, + "epoch": 2.552, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5975285768508911, + "learning_rate": 1e-06, + "loss": -0.0071, + "num_tokens": 11124151.0, + "reward": 13.259191513061523, + "reward_std": 2.373781442642212, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4164295196533203, + "rewards/kidney_reward/std": 0.8949340581893921, + "rewards/length2tails_reward/mean": 0.877596378326416, + "rewards/length2tails_reward/std": 0.1836080551147461, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3513259887695312, + "rewards/thermo_reward/std": 1.1975548267364502, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.28125, + "completions/mean_terminated_length": 273.28125, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.09055192023515701, + "epoch": 2.5540000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09261415898799896, + "learning_rate": 9.987177176356477e-07, + "loss": 0.0016, + "num_tokens": 11132928.0, + "reward": 13.560474395751953, + "reward_std": 1.0087324380874634, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8390970826148987, + "rewards/length2tails_reward/std": 0.1651468575000763, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4162583351135254, + "rewards/thermo_reward/std": 0.8840147256851196, + "step": 1277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.15625, + "completions/mean_terminated_length": 273.15625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09465023316442966, + "epoch": 2.556, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1217871680855751, + "learning_rate": 9.974354373796866e-07, + "loss": 0.0025, + "num_tokens": 11141701.0, + "reward": 13.690476417541504, + "reward_std": 0.6046956181526184, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8032035827636719, + "rewards/length2tails_reward/std": 0.23792394995689392, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498507022857666, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5, + "completions/mean_terminated_length": 272.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0930365202948451, + "epoch": 2.558, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10494538396596909, + "learning_rate": 9.961531613405042e-07, + "loss": -0.0046, + "num_tokens": 11150453.0, + "reward": 13.534183502197266, + "reward_std": 1.6083168983459473, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7721899151802063, + "rewards/length2tails_reward/std": 0.2945183515548706, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4240174293518066, + "rewards/thermo_reward/std": 1.4253337383270264, + "step": 1279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09113780409097672, + "epoch": 2.56, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10989918559789658, + "learning_rate": 9.948708916264816e-07, + "loss": -0.0034, + "num_tokens": 11159181.0, + "reward": 13.42755126953125, + "reward_std": 1.8706536293029785, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5374107360839844, + "rewards/kidney_reward/std": 0.503849983215332, + "rewards/length2tails_reward/mean": 0.7447144389152527, + "rewards/length2tails_reward/std": 0.2845161557197571, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3544845581054688, + "rewards/thermo_reward/std": 1.3823689222335815, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09468003083020449, + "epoch": 2.5620000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09075962752103806, + "learning_rate": 9.935886303459888e-07, + "loss": 0.0029, + "num_tokens": 11167951.0, + "reward": 13.570971488952637, + "reward_std": 1.215386986732483, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8060849905014038, + "rewards/length2tails_reward/std": 0.2591499388217926, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4574167728424072, + "rewards/thermo_reward/std": 1.0486905574798584, + "step": 1281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08730126172304153, + "epoch": 2.564, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10445868223905563, + "learning_rate": 9.923063796073825e-07, + "loss": -0.0011, + "num_tokens": 11176699.0, + "reward": 12.568877220153809, + "reward_std": 5.231773376464844, + "rewards/fitness_reward/mean": 6.651214122772217, + "rewards/fitness_reward/std": 2.7942206859588623, + "rewards/kidney_reward/mean": 2.4094700813293457, + "rewards/kidney_reward/std": 1.0789785385131836, + "rewards/length2tails_reward/mean": 0.7707823514938354, + "rewards/length2tails_reward/std": 0.26711055636405945, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3311150074005127, + "rewards/thermo_reward/std": 1.571403980255127, + "step": 1282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0821879762224853, + "epoch": 2.566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14877688884735107, + "learning_rate": 9.91024141519002e-07, + "loss": 0.0087, + "num_tokens": 11185455.0, + "reward": 13.497551918029785, + "reward_std": 1.3498157262802124, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7818381190299988, + "rewards/length2tails_reward/std": 0.25987908244132996, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3864216804504395, + "rewards/thermo_reward/std": 1.1549537181854248, + "step": 1283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09241542033851147, + "epoch": 2.568, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22466203570365906, + "learning_rate": 9.897419181891654e-07, + "loss": -0.0002, + "num_tokens": 11194189.0, + "reward": 13.665223121643066, + "reward_std": 1.5177754163742065, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.495814561843872, + "rewards/kidney_reward/std": 0.5996876955032349, + "rewards/length2tails_reward/mean": 0.7762358784675598, + "rewards/length2tails_reward/std": 0.24603904783725739, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6305999755859375, + "rewards/thermo_reward/std": 0.917843759059906, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08896076306700706, + "epoch": 2.57, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08401848375797272, + "learning_rate": 9.884597117261666e-07, + "loss": -0.0019, + "num_tokens": 11202921.0, + "reward": 13.83401870727539, + "reward_std": 0.4327254891395569, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7684556245803833, + "rewards/length2tails_reward/std": 0.253930002450943, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09406418073922396, + "epoch": 2.572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10037721693515778, + "learning_rate": 9.871775242382725e-07, + "loss": 0.0042, + "num_tokens": 11211635.0, + "reward": 13.790679931640625, + "reward_std": 0.4637486934661865, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7339187860488892, + "rewards/length2tails_reward/std": 0.26712721586227417, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08917554654181004, + "epoch": 2.574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1197580024600029, + "learning_rate": 9.85895357833718e-07, + "loss": 0.0004, + "num_tokens": 11220366.0, + "reward": 12.90008544921875, + "reward_std": 4.444961071014404, + "rewards/fitness_reward/mean": 6.947967052459717, + "rewards/fitness_reward/std": 2.0279359817504883, + "rewards/kidney_reward/mean": 2.416769504547119, + "rewards/kidney_reward/std": 1.0381296873092651, + "rewards/length2tails_reward/mean": 0.7331525087356567, + "rewards/length2tails_reward/std": 0.3075205981731415, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.362034320831299, + "rewards/thermo_reward/std": 1.5685944557189941, + "step": 1287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07669658865779638, + "epoch": 2.576, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2233160138130188, + "learning_rate": 9.846132146207038e-07, + "loss": -0.008, + "num_tokens": 11229092.0, + "reward": 12.308110237121582, + "reward_std": 4.719709396362305, + "rewards/fitness_reward/mean": 6.687413215637207, + "rewards/fitness_reward/std": 2.4316930770874023, + "rewards/kidney_reward/mean": 2.266258478164673, + "rewards/kidney_reward/std": 1.1048941612243652, + "rewards/length2tails_reward/mean": 0.6728297472000122, + "rewards/length2tails_reward/std": 0.375557005405426, + "rewards/repeated_in_batch_reward/mean": 0.9375, + "rewards/repeated_in_batch_reward/std": 0.24593468010425568, + "rewards/thermo_reward/mean": 3.193406105041504, + "rewards/thermo_reward/std": 1.554321527481079, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.0, + "completions/mean_terminated_length": 273.0, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.0911758104339242, + "epoch": 2.578, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07548670470714569, + "learning_rate": 9.833310967073918e-07, + "loss": -0.0013, + "num_tokens": 11237860.0, + "reward": 13.564449310302734, + "reward_std": 1.4019521474838257, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8588228225708008, + "rewards/length2tails_reward/std": 0.21809880435466766, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.418262481689453, + "rewards/thermo_reward/std": 1.2506855726242065, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.1875, + "completions/mean_terminated_length": 269.1875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.09383352566510439, + "epoch": 2.58, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6485643982887268, + "learning_rate": 9.82049006201903e-07, + "loss": -0.0941, + "num_tokens": 11246506.0, + "reward": 12.823418617248535, + "reward_std": 4.95705509185791, + "rewards/fitness_reward/mean": 6.976078033447266, + "rewards/fitness_reward/std": 2.1784961223602295, + "rewards/kidney_reward/mean": 2.378368854522705, + "rewards/kidney_reward/std": 1.2533495426177979, + "rewards/length2tails_reward/mean": 0.8960451483726501, + "rewards/length2tails_reward/std": 0.13475503027439117, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2793679237365723, + "rewards/thermo_reward/std": 1.744246244430542, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.71875, + "completions/mean_terminated_length": 273.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08943956810981035, + "epoch": 2.582, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11884415149688721, + "learning_rate": 9.807669452123129e-07, + "loss": 0.0007, + "num_tokens": 11255297.0, + "reward": 13.767016410827637, + "reward_std": 1.1081784963607788, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8364322781562805, + "rewards/length2tails_reward/std": 0.21595613658428192, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6230673789978027, + "rewards/thermo_reward/std": 0.9592058062553406, + "step": 1291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.21875, + "completions/mean_terminated_length": 273.21875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.09474212862551212, + "epoch": 2.584, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08420742303133011, + "learning_rate": 9.794849158466492e-07, + "loss": -0.0018, + "num_tokens": 11264072.0, + "reward": 13.596220970153809, + "reward_std": 1.322066307067871, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8675425052642822, + "rewards/length2tails_reward/std": 0.1385192573070526, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.476520538330078, + "rewards/thermo_reward/std": 1.1455049514770508, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08064570464193821, + "epoch": 2.586, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08840952068567276, + "learning_rate": 9.782029202128863e-07, + "loss": 0.0042, + "num_tokens": 11272827.0, + "reward": 13.258116722106934, + "reward_std": 2.666351079940796, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.459886074066162, + "rewards/kidney_reward/std": 0.7977776527404785, + "rewards/length2tails_reward/mean": 0.7825390100479126, + "rewards/length2tails_reward/std": 0.2645307183265686, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.373810291290283, + "rewards/thermo_reward/std": 1.3559972047805786, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08870595414191484, + "epoch": 2.588, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10983943194150925, + "learning_rate": 9.769209604189447e-07, + "loss": -0.006, + "num_tokens": 11281545.0, + "reward": 13.830974578857422, + "reward_std": 0.43895232677459717, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7380162477493286, + "rewards/length2tails_reward/std": 0.2492593377828598, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 282.59375, + "completions/mean_terminated_length": 282.59375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09890826977789402, + "epoch": 2.59, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.069254994392395, + "learning_rate": 9.756390385726847e-07, + "loss": -0.0119, + "num_tokens": 11290620.0, + "reward": 13.273027420043945, + "reward_std": 2.208266496658325, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.453822135925293, + "rewards/kidney_reward/std": 0.6914807558059692, + "rewards/length2tails_reward/mean": 0.7368823885917664, + "rewards/length2tails_reward/std": 0.28326600790023804, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.341841220855713, + "rewards/thermo_reward/std": 1.2458171844482422, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08741918485611677, + "epoch": 2.592, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08522574603557587, + "learning_rate": 9.743571567819045e-07, + "loss": -0.0004, + "num_tokens": 11299349.0, + "reward": 13.694768905639648, + "reward_std": 0.9248803853988647, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.6994171142578125, + "rewards/length2tails_reward/std": 0.281656950712204, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5645222663879395, + "rewards/thermo_reward/std": 0.8824394345283508, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 274.625, + "completions/mean_terminated_length": 274.625, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "entropy": 0.09598539769649506, + "epoch": 2.594, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09700886160135269, + "learning_rate": 9.730753171543374e-07, + "loss": 0.0012, + "num_tokens": 11308169.0, + "reward": 13.684208869934082, + "reward_std": 0.8987782001495361, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.9033083319664001, + "rewards/length2tails_reward/std": 0.13247719407081604, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5335729122161865, + "rewards/thermo_reward/std": 0.8546720743179321, + "step": 1297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09415233507752419, + "epoch": 2.596, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19626864790916443, + "learning_rate": 9.717935217976457e-07, + "loss": -0.0005, + "num_tokens": 11316915.0, + "reward": 13.699674606323242, + "reward_std": 0.6537702679634094, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.769914984703064, + "rewards/length2tails_reward/std": 0.2710944712162018, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.09926941432058811, + "epoch": 2.598, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1721256971359253, + "learning_rate": 9.7051177281942e-07, + "loss": -0.0007, + "num_tokens": 11325669.0, + "reward": 13.715205192565918, + "reward_std": 0.5400824546813965, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7769005298614502, + "rewards/length2tails_reward/std": 0.29632213711738586, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498504638671875, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09105388261377811, + "epoch": 2.6, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09366677701473236, + "learning_rate": 9.692300723271752e-07, + "loss": -0.0025, + "num_tokens": 11334407.0, + "reward": 12.53223705291748, + "reward_std": 4.189420223236084, + "rewards/fitness_reward/mean": 6.938035011291504, + "rewards/fitness_reward/std": 1.840762972831726, + "rewards/kidney_reward/mean": 2.2998170852661133, + "rewards/kidney_reward/std": 1.048743724822998, + "rewards/length2tails_reward/mean": 0.7610618472099304, + "rewards/length2tails_reward/std": 0.30591100454330444, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1182785034179688, + "rewards/thermo_reward/std": 1.7776644229888916, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.1875, + "completions/mean_terminated_length": 273.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09142200648784637, + "epoch": 2.602, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10108436644077301, + "learning_rate": 9.679484224283447e-07, + "loss": -0.0024, + "num_tokens": 11343181.0, + "reward": 13.734735488891602, + "reward_std": 0.5882219076156616, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8469273447990417, + "rewards/length2tails_reward/std": 0.2233157604932785, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897367000579834, + "rewards/thermo_reward/std": 0.5061467885971069, + "step": 1301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.09068982116878033, + "epoch": 2.604, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15013930201530457, + "learning_rate": 9.666668252302806e-07, + "loss": 0.0007, + "num_tokens": 11351865.0, + "reward": 12.603252410888672, + "reward_std": 5.028195381164551, + "rewards/fitness_reward/mean": 6.663951873779297, + "rewards/fitness_reward/std": 2.751307964324951, + "rewards/kidney_reward/mean": 2.4275808334350586, + "rewards/kidney_reward/std": 0.9776963591575623, + "rewards/length2tails_reward/mean": 0.7178422808647156, + "rewards/length2tails_reward/std": 0.3161407709121704, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.339935779571533, + "rewards/thermo_reward/std": 1.525055170059204, + "step": 1302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.8125, + "completions/mean_terminated_length": 272.8125, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.08908124640583992, + "epoch": 2.606, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09386913478374481, + "learning_rate": 9.653852828402466e-07, + "loss": -0.0004, + "num_tokens": 11360627.0, + "reward": 13.692395210266113, + "reward_std": 0.9287048578262329, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8492799401283264, + "rewards/length2tails_reward/std": 0.2031726986169815, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5198025703430176, + "rewards/thermo_reward/std": 0.9229288101196289, + "step": 1303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08594439877197146, + "epoch": 2.608, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10545476526021957, + "learning_rate": 9.641037973654178e-07, + "loss": -0.003, + "num_tokens": 11369375.0, + "reward": 13.526748657226562, + "reward_std": 1.576993703842163, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.535891532897949, + "rewards/kidney_reward/std": 0.5124428272247314, + "rewards/length2tails_reward/mean": 0.7894778251647949, + "rewards/length2tails_reward/std": 0.26516595482826233, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4507243633270264, + "rewards/thermo_reward/std": 1.1982841491699219, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09429272636771202, + "epoch": 2.61, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1060292050242424, + "learning_rate": 9.628223709128749e-07, + "loss": 0.0014, + "num_tokens": 11378123.0, + "reward": 13.520185470581055, + "reward_std": 1.6307603120803833, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.78755122423172, + "rewards/length2tails_reward/std": 0.24123510718345642, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.381124258041382, + "rewards/thermo_reward/std": 1.503252625465393, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 274.59375, + "completions/mean_terminated_length": 274.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08381335902959108, + "epoch": 2.612, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1200776994228363, + "learning_rate": 9.615410055896014e-07, + "loss": 0.001, + "num_tokens": 11386942.0, + "reward": 12.965974807739258, + "reward_std": 3.969346761703491, + "rewards/fitness_reward/mean": 7.0461039543151855, + "rewards/fitness_reward/std": 1.7823677062988281, + "rewards/kidney_reward/mean": 2.481417179107666, + "rewards/kidney_reward/std": 0.8205979466438293, + "rewards/length2tails_reward/mean": 0.8177987337112427, + "rewards/length2tails_reward/std": 0.2320345640182495, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2566728591918945, + "rewards/thermo_reward/std": 1.5435835123062134, + "step": 1306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.08833951037377119, + "epoch": 2.614, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08464069664478302, + "learning_rate": 9.602597035024815e-07, + "loss": -0.004, + "num_tokens": 11395670.0, + "reward": 13.229515075683594, + "reward_std": 1.6974692344665527, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8088982105255127, + "rewards/length2tails_reward/std": 0.26726973056793213, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0883195400238037, + "rewards/thermo_reward/std": 1.6101175546646118, + "step": 1307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.09514253214001656, + "epoch": 2.616, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0928158089518547, + "learning_rate": 9.589784667582934e-07, + "loss": -0.0015, + "num_tokens": 11404390.0, + "reward": 13.878170013427734, + "reward_std": 0.3858002722263336, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8111158013343811, + "rewards/length2tails_reward/std": 0.23564399778842926, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.08827656414359808, + "epoch": 2.618, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09488391876220703, + "learning_rate": 9.576972974637097e-07, + "loss": 0.0011, + "num_tokens": 11413122.0, + "reward": 13.837629318237305, + "reward_std": 0.42670488357543945, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8045660853385925, + "rewards/length2tails_reward/std": 0.18262574076652527, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.0957744549959898, + "epoch": 2.62, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.283891499042511, + "learning_rate": 9.564161977252915e-07, + "loss": 0.0002, + "num_tokens": 11421877.0, + "reward": 13.834540367126465, + "reward_std": 0.43187758326530457, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.773668646812439, + "rewards/length2tails_reward/std": 0.24948285520076752, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08740524481981993, + "epoch": 2.622, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08296260982751846, + "learning_rate": 9.551351696494853e-07, + "loss": -0.0021, + "num_tokens": 11430620.0, + "reward": 13.915056228637695, + "reward_std": 0.31537458300590515, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7811083793640137, + "rewards/length2tails_reward/std": 0.24637141823768616, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09013056010007858, + "epoch": 2.624, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1346326768398285, + "learning_rate": 9.538542153426195e-07, + "loss": -0.006, + "num_tokens": 11439367.0, + "reward": 13.187000274658203, + "reward_std": 2.221527099609375, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3923935890197754, + "rewards/kidney_reward/std": 0.8301119208335876, + "rewards/length2tails_reward/mean": 0.7555396556854248, + "rewards/length2tails_reward/std": 0.29163092374801636, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3153765201568604, + "rewards/thermo_reward/std": 1.1827013492584229, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08751111663877964, + "epoch": 2.626, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08335331082344055, + "learning_rate": 9.525733369109017e-07, + "loss": -0.0019, + "num_tokens": 11448102.0, + "reward": 13.484519004821777, + "reward_std": 1.0472941398620605, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7206494808197021, + "rewards/length2tails_reward/std": 0.29937759041786194, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3521485328674316, + "rewards/thermo_reward/std": 1.0041559934616089, + "step": 1313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0922544403001666, + "epoch": 2.628, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.058741435408592224, + "learning_rate": 9.512925364604151e-07, + "loss": -0.0056, + "num_tokens": 11456850.0, + "reward": 13.344647407531738, + "reward_std": 3.008349657058716, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.5390896797180176, + "rewards/kidney_reward/std": 0.49435171484947205, + "rewards/length2tails_reward/mean": 0.7650218605995178, + "rewards/length2tails_reward/std": 0.2936790883541107, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5760021209716797, + "rewards/thermo_reward/std": 0.8242037296295166, + "step": 1314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.21875, + "completions/mean_terminated_length": 273.21875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0923885004594922, + "epoch": 2.63, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08085627108812332, + "learning_rate": 9.500118160971138e-07, + "loss": -0.0054, + "num_tokens": 11465625.0, + "reward": 13.880472183227539, + "reward_std": 0.38467854261398315, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8341273069381714, + "rewards/length2tails_reward/std": 0.19471846520900726, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "entropy": 0.10310139693319798, + "epoch": 2.632, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07032930105924606, + "learning_rate": 9.487311779268209e-07, + "loss": -0.0031, + "num_tokens": 11474348.0, + "reward": 13.608085632324219, + "reward_std": 1.227725625038147, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7867770791053772, + "rewards/length2tails_reward/std": 0.22749637067317963, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4964609146118164, + "rewards/thermo_reward/std": 1.0417221784591675, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0867899302393198, + "epoch": 2.634, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09172943234443665, + "learning_rate": 9.474506240552238e-07, + "loss": -0.0067, + "num_tokens": 11483095.0, + "reward": 13.61172103881836, + "reward_std": 1.7407996654510498, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.537740707397461, + "rewards/kidney_reward/std": 0.5019829869270325, + "rewards/length2tails_reward/mean": 0.7726287841796875, + "rewards/length2tails_reward/std": 0.2952404320240021, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.535531759262085, + "rewards/thermo_reward/std": 1.235055685043335, + "step": 1317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0941666942089796, + "epoch": 2.636, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09857843071222305, + "learning_rate": 9.461701565878718e-07, + "loss": -0.003, + "num_tokens": 11491842.0, + "reward": 13.678802490234375, + "reward_std": 0.564246416091919, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8117349147796631, + "rewards/length2tails_reward/std": 0.22255975008010864, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.50996470451355, + "rewards/thermo_reward/std": 0.5615194439888, + "step": 1318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.40625, + "completions/mean_terminated_length": 272.40625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09366055484861135, + "epoch": 2.638, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11917433142662048, + "learning_rate": 9.448897776301721e-07, + "loss": -0.0, + "num_tokens": 11500591.0, + "reward": 13.591472625732422, + "reward_std": 1.2057130336761475, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7693405747413635, + "rewards/length2tails_reward/std": 0.2404543161392212, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4542322158813477, + "rewards/thermo_reward/std": 1.0648303031921387, + "step": 1319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.0, + "completions/mean_terminated_length": 273.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.08513898495584726, + "epoch": 2.64, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1232292652130127, + "learning_rate": 9.436094892873857e-07, + "loss": 0.001, + "num_tokens": 11509359.0, + "reward": 13.616416931152344, + "reward_std": 1.497071385383606, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5254647731781006, + "rewards/kidney_reward/std": 0.5714265704154968, + "rewards/length2tails_reward/mean": 0.8009185791015625, + "rewards/length2tails_reward/std": 0.28200775384902954, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.549675941467285, + "rewards/thermo_reward/std": 0.9590282440185547, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.53125, + "completions/mean_terminated_length": 270.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08928291499614716, + "epoch": 2.642, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0923282653093338, + "learning_rate": 9.423292936646257e-07, + "loss": 0.002, + "num_tokens": 11518048.0, + "reward": 13.902721405029297, + "reward_std": 0.30983614921569824, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.6577587127685547, + "rewards/length2tails_reward/std": 0.30942097306251526, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.08833121135830879, + "epoch": 2.644, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06364872306585312, + "learning_rate": 9.410491928668515e-07, + "loss": -0.0039, + "num_tokens": 11526750.0, + "reward": 13.665266036987305, + "reward_std": 1.437191367149353, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5205302238464355, + "rewards/kidney_reward/std": 0.5993408560752869, + "rewards/length2tails_reward/mean": 0.765282154083252, + "rewards/length2tails_reward/std": 0.2492385059595108, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.607022523880005, + "rewards/thermo_reward/std": 0.8501169085502625, + "step": 1322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09671840630471706, + "epoch": 2.646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06736798584461212, + "learning_rate": 9.397691889988674e-07, + "loss": -0.003, + "num_tokens": 11535497.0, + "reward": 13.687711715698242, + "reward_std": 1.0258924961090088, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.812004804611206, + "rewards/length2tails_reward/std": 0.22589294612407684, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5735652446746826, + "rewards/thermo_reward/std": 0.8364829421043396, + "step": 1323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0973320035263896, + "epoch": 2.648, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10905929654836655, + "learning_rate": 9.384892841653187e-07, + "loss": -0.0027, + "num_tokens": 11544216.0, + "reward": 13.089970588684082, + "reward_std": 2.683310031890869, + "rewards/fitness_reward/mean": 7.052721977233887, + "rewards/fitness_reward/std": 1.7449299097061157, + "rewards/kidney_reward/mean": 2.4832568168640137, + "rewards/kidney_reward/std": 0.5357418060302734, + "rewards/length2tails_reward/mean": 0.7538886070251465, + "rewards/length2tails_reward/std": 0.2668653428554535, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3786025047302246, + "rewards/thermo_reward/std": 1.2563246488571167, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 274.03125, + "completions/mean_terminated_length": 274.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09613270545378327, + "epoch": 2.65, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0994558036327362, + "learning_rate": 9.372094804706866e-07, + "loss": -0.0021, + "num_tokens": 11553017.0, + "reward": 13.69377326965332, + "reward_std": 0.6110822558403015, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8361679315567017, + "rewards/length2tails_reward/std": 0.231988325715065, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498504638671875, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08980119600892067, + "epoch": 2.652, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09267576783895493, + "learning_rate": 9.359297800192871e-07, + "loss": -0.0017, + "num_tokens": 11561768.0, + "reward": 13.330853462219238, + "reward_std": 1.7552409172058105, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5369973182678223, + "rewards/kidney_reward/std": 0.5061891078948975, + "rewards/length2tails_reward/mean": 0.7579162120819092, + "rewards/length2tails_reward/std": 0.28038784861564636, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2568795680999756, + "rewards/thermo_reward/std": 1.2792414426803589, + "step": 1326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08606134541332722, + "epoch": 2.654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08788134902715683, + "learning_rate": 9.346501849152658e-07, + "loss": -0.0094, + "num_tokens": 11570487.0, + "reward": 12.317151069641113, + "reward_std": 4.191816329956055, + "rewards/fitness_reward/mean": 6.938035011291504, + "rewards/fitness_reward/std": 1.840762972831726, + "rewards/kidney_reward/mean": 2.308594226837158, + "rewards/kidney_reward/std": 0.9246311187744141, + "rewards/length2tails_reward/mean": 0.6851747035980225, + "rewards/length2tails_reward/std": 0.3613194525241852, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.902003765106201, + "rewards/thermo_reward/std": 2.1534359455108643, + "step": 1327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0841665007174015, + "epoch": 2.656, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0726706013083458, + "learning_rate": 9.333706972625954e-07, + "loss": -0.0025, + "num_tokens": 11579231.0, + "reward": 13.50656509399414, + "reward_std": 1.1999610662460327, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7119618654251099, + "rewards/length2tails_reward/std": 0.3201157748699188, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.429781913757324, + "rewards/thermo_reward/std": 0.9979439377784729, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09338252898305655, + "epoch": 2.658, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14197185635566711, + "learning_rate": 9.320913191650723e-07, + "loss": -0.0002, + "num_tokens": 11587989.0, + "reward": 13.265280723571777, + "reward_std": 2.2200560569763184, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.487063407897949, + "rewards/kidney_reward/std": 0.6476423740386963, + "rewards/length2tails_reward/mean": 0.7995427846908569, + "rewards/length2tails_reward/std": 0.24626636505126953, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2945876121520996, + "rewards/thermo_reward/std": 1.3599261045455933, + "step": 1329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09512456692755222, + "epoch": 2.66, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08194391429424286, + "learning_rate": 9.308120527263116e-07, + "loss": -0.0042, + "num_tokens": 11596755.0, + "reward": 13.631172180175781, + "reward_std": 0.9415711760520935, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8064660429954529, + "rewards/length2tails_reward/std": 0.2341540902853012, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4902195930480957, + "rewards/thermo_reward/std": 0.8866589069366455, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "entropy": 0.09813211299479008, + "epoch": 2.662, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10500352084636688, + "learning_rate": 9.295329000497459e-07, + "loss": -0.0067, + "num_tokens": 11605469.0, + "reward": 13.039316177368164, + "reward_std": 2.732551336288452, + "rewards/fitness_reward/mean": 7.0139055252075195, + "rewards/fitness_reward/std": 1.9645084142684937, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8129358887672424, + "rewards/length2tails_reward/std": 0.243895024061203, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.272355556488037, + "rewards/thermo_reward/std": 1.4353864192962646, + "step": 1331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.09504935797303915, + "epoch": 2.664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07366065680980682, + "learning_rate": 9.282538632386206e-07, + "loss": -0.0012, + "num_tokens": 11614216.0, + "reward": 13.701459884643555, + "reward_std": 0.8884310722351074, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8486068844795227, + "rewards/length2tails_reward/std": 0.20267438888549805, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.528935194015503, + "rewards/thermo_reward/std": 0.8774805665016174, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0826009577140212, + "epoch": 2.666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08948934078216553, + "learning_rate": 9.269749443959904e-07, + "loss": -0.0028, + "num_tokens": 11622963.0, + "reward": 13.685850143432617, + "reward_std": 1.211296796798706, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7733684182167053, + "rewards/length2tails_reward/std": 0.2745286822319031, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5482077598571777, + "rewards/thermo_reward/std": 1.1658411026000977, + "step": 1333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.25, + "completions/mean_terminated_length": 273.25, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09233411867171526, + "epoch": 2.668, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1923236846923828, + "learning_rate": 9.25696145624715e-07, + "loss": 0.0022, + "num_tokens": 11631739.0, + "reward": 13.02658748626709, + "reward_std": 5.040348529815674, + "rewards/fitness_reward/mean": 6.9716691970825195, + "rewards/fitness_reward/std": 2.2034339904785156, + "rewards/kidney_reward/mean": 2.4078118801116943, + "rewards/kidney_reward/std": 1.2369717359542847, + "rewards/length2tails_reward/mean": 0.8197587728500366, + "rewards/length2tails_reward/std": 0.24441765248775482, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4651312828063965, + "rewards/thermo_reward/std": 1.6234033107757568, + "step": 1334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09106593765318394, + "epoch": 2.67, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05460730567574501, + "learning_rate": 9.244174690274588e-07, + "loss": -0.0042, + "num_tokens": 11640487.0, + "reward": 13.567729949951172, + "reward_std": 1.3579232692718506, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8017369508743286, + "rewards/length2tails_reward/std": 0.19972741603851318, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4272513389587402, + "rewards/thermo_reward/std": 1.2037054300308228, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.09610113222151995, + "epoch": 2.672, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1169869601726532, + "learning_rate": 9.231389167066835e-07, + "loss": -0.0015, + "num_tokens": 11649188.0, + "reward": 13.5916109085083, + "reward_std": 1.0232844352722168, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8028391003608704, + "rewards/length2tails_reward/std": 0.2460554540157318, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4236621856689453, + "rewards/thermo_reward/std": 1.0279661417007446, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.07874941732734442, + "epoch": 2.674, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05606454983353615, + "learning_rate": 9.218604907646474e-07, + "loss": -0.0038, + "num_tokens": 11657938.0, + "reward": 13.528667449951172, + "reward_std": 1.756035566329956, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.526620388031006, + "rewards/kidney_reward/std": 0.5648899674415588, + "rewards/length2tails_reward/mean": 0.7580454349517822, + "rewards/length2tails_reward/std": 0.32083970308303833, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.465057611465454, + "rewards/thermo_reward/std": 1.2058982849121094, + "step": 1337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08984020911157131, + "epoch": 2.676, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19920657575130463, + "learning_rate": 9.205821933034011e-07, + "loss": 0.0046, + "num_tokens": 11666694.0, + "reward": 13.52869987487793, + "reward_std": 2.2076985836029053, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4739794731140137, + "rewards/kidney_reward/std": 0.8626706004142761, + "rewards/length2tails_reward/mean": 0.8132075071334839, + "rewards/length2tails_reward/std": 0.2025136947631836, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5122148990631104, + "rewards/thermo_reward/std": 1.3630452156066895, + "step": 1338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.78125, + "completions/mean_terminated_length": 273.78125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0930999293923378, + "epoch": 2.678, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08068838715553284, + "learning_rate": 9.193040264247828e-07, + "loss": 0.0001, + "num_tokens": 11675487.0, + "reward": 13.657683372497559, + "reward_std": 1.52280592918396, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8673219680786133, + "rewards/length2tails_reward/std": 0.2095680683851242, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5106451511383057, + "rewards/thermo_reward/std": 1.371688723564148, + "step": 1339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09656371735036373, + "epoch": 2.68, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1636841297149658, + "learning_rate": 9.180259922304174e-07, + "loss": 0.0087, + "num_tokens": 11684218.0, + "reward": 13.57703971862793, + "reward_std": 1.2407475709915161, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4559011459350586, + "rewards/kidney_reward/std": 0.9649369716644287, + "rewards/length2tails_reward/mean": 0.7021640539169312, + "rewards/length2tails_reward/std": 0.3510373830795288, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.09234402794390917, + "epoch": 2.682, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12080354243516922, + "learning_rate": 9.167480928217107e-07, + "loss": 0.0034, + "num_tokens": 11692952.0, + "reward": 13.760887145996094, + "reward_std": 0.505234956741333, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8348555564880371, + "rewards/length2tails_reward/std": 0.2311815470457077, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.0, + "completions/mean_terminated_length": 273.0, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0820971867069602, + "epoch": 2.684, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07293231785297394, + "learning_rate": 9.154703302998472e-07, + "loss": -0.0028, + "num_tokens": 11701720.0, + "reward": 13.877494812011719, + "reward_std": 0.3817107677459717, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8043537139892578, + "rewards/length2tails_reward/std": 0.22909440100193024, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0913189435377717, + "epoch": 2.686, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.193898007273674, + "learning_rate": 9.141927067657868e-07, + "loss": 0.0014, + "num_tokens": 11710431.0, + "reward": 12.636367797851562, + "reward_std": 4.728992462158203, + "rewards/fitness_reward/mean": 6.928929328918457, + "rewards/fitness_reward/std": 1.8890396356582642, + "rewards/kidney_reward/mean": 2.3434696197509766, + "rewards/kidney_reward/std": 1.1144951581954956, + "rewards/length2tails_reward/mean": 0.720057487487793, + "rewards/length2tails_reward/std": 0.2935502529144287, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1919631958007812, + "rewards/thermo_reward/std": 1.8892871141433716, + "step": 1343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08928275294601917, + "epoch": 2.6879999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0986161008477211, + "learning_rate": 9.129152243202596e-07, + "loss": 0.0014, + "num_tokens": 11719196.0, + "reward": 13.701171875, + "reward_std": 1.056113839149475, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8297706842422485, + "rewards/length2tails_reward/std": 0.20739682018756866, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.557888984680176, + "rewards/thermo_reward/std": 0.9164990186691284, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09937832318246365, + "epoch": 2.69, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11231338977813721, + "learning_rate": 9.116378850637649e-07, + "loss": 0.0024, + "num_tokens": 11727966.0, + "reward": 13.841484069824219, + "reward_std": 0.42608216404914856, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8431087136268616, + "rewards/length2tails_reward/std": 0.208415225148201, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0859296889975667, + "epoch": 2.692, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13398477435112, + "learning_rate": 9.103606910965665e-07, + "loss": 0.0029, + "num_tokens": 11736704.0, + "reward": 13.708622932434082, + "reward_std": 0.8716744184494019, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.736136794090271, + "rewards/length2tails_reward/std": 0.29717540740966797, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5747036933898926, + "rewards/thermo_reward/std": 0.8307410478591919, + "step": 1346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09783033747226, + "epoch": 2.694, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19264395534992218, + "learning_rate": 9.090836445186883e-07, + "loss": 0.0033, + "num_tokens": 11745444.0, + "reward": 12.873300552368164, + "reward_std": 3.236984968185425, + "rewards/fitness_reward/mean": 7.188657283782959, + "rewards/fitness_reward/std": 0.7179933190345764, + "rewards/kidney_reward/mean": 2.3760547637939453, + "rewards/kidney_reward/std": 0.8919334411621094, + "rewards/length2tails_reward/mean": 0.7774170637130737, + "rewards/length2tails_reward/std": 0.2432018667459488, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1308465003967285, + "rewards/thermo_reward/std": 1.732054591178894, + "step": 1347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09386403812095523, + "epoch": 2.6959999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1917669028043747, + "learning_rate": 9.078067474299132e-07, + "loss": 0.0051, + "num_tokens": 11754166.0, + "reward": 13.039304733276367, + "reward_std": 2.959359884262085, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.452592670917511, + "rewards/kidney_reward/mean": 2.3586478233337402, + "rewards/kidney_reward/std": 1.0929458141326904, + "rewards/length2tails_reward/mean": 0.7195242643356323, + "rewards/length2tails_reward/std": 0.28090983629226685, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2625389099121094, + "rewards/thermo_reward/std": 1.4743518829345703, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.53125, + "completions/mean_terminated_length": 273.53125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08278051018714905, + "epoch": 2.698, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08158128708600998, + "learning_rate": 9.06530001929777e-07, + "loss": -0.0031, + "num_tokens": 11762951.0, + "reward": 13.546849250793457, + "reward_std": 1.4874722957611084, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5375123023986816, + "rewards/kidney_reward/std": 0.503275454044342, + "rewards/length2tails_reward/mean": 0.843279242515564, + "rewards/length2tails_reward/std": 0.21693500876426697, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4638240337371826, + "rewards/thermo_reward/std": 1.0164086818695068, + "step": 1349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.5, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.08491826569661498, + "epoch": 2.7, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07856491208076477, + "learning_rate": 9.05253410117567e-07, + "loss": -0.0062, + "num_tokens": 11771735.0, + "reward": 13.706490516662598, + "reward_std": 0.8723586797714233, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8548135161399841, + "rewards/length2tails_reward/std": 0.2584093511104584, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.533344268798828, + "rewards/thermo_reward/std": 0.8557917475700378, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 273.21875, + "completions/mean_terminated_length": 273.21875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08885801862925291, + "epoch": 2.702, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10287737846374512, + "learning_rate": 9.039769740923182e-07, + "loss": 0.0056, + "num_tokens": 11780510.0, + "reward": 13.050555229187012, + "reward_std": 4.520473957061768, + "rewards/fitness_reward/mean": 7.021054267883301, + "rewards/fitness_reward/std": 1.924071192741394, + "rewards/kidney_reward/mean": 2.3942604064941406, + "rewards/kidney_reward/std": 1.1641942262649536, + "rewards/length2tails_reward/mean": 0.8088257908821106, + "rewards/length2tails_reward/std": 0.22004175186157227, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4543581008911133, + "rewards/thermo_reward/std": 1.4701238870620728, + "step": 1351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.5, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09847811609506607, + "epoch": 2.7039999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23275437951087952, + "learning_rate": 9.027006959528083e-07, + "loss": 0.0021, + "num_tokens": 11789294.0, + "reward": 13.7213773727417, + "reward_std": 0.5335816740989685, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8386157155036926, + "rewards/length2tails_reward/std": 0.21023456752300262, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498504638671875, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.09279504232108593, + "epoch": 2.706, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13962967693805695, + "learning_rate": 9.014245777975564e-07, + "loss": 0.007, + "num_tokens": 11798050.0, + "reward": 13.785286903381348, + "reward_std": 0.5953236818313599, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8283153772354126, + "rewards/length2tails_reward/std": 0.20525197684764862, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08352559478953481, + "epoch": 2.708, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1153552308678627, + "learning_rate": 9.00148621724818e-07, + "loss": -0.0036, + "num_tokens": 11806752.0, + "reward": 13.292638778686523, + "reward_std": 3.030071973800659, + "rewards/fitness_reward/mean": 7.052952766418457, + "rewards/fitness_reward/std": 1.7436254024505615, + "rewards/kidney_reward/mean": 2.5388498306274414, + "rewards/kidney_reward/std": 0.49570968747138977, + "rewards/length2tails_reward/mean": 0.6864029765129089, + "rewards/length2tails_reward/std": 0.297585666179657, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.532196044921875, + "rewards/thermo_reward/std": 0.8614235520362854, + "step": 1354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.21875, + "completions/mean_terminated_length": 273.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09010605234652758, + "epoch": 2.71, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12815427780151367, + "learning_rate": 8.988728298325821e-07, + "loss": 0.0031, + "num_tokens": 11815527.0, + "reward": 13.77267074584961, + "reward_std": 0.549005389213562, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8274263739585876, + "rewards/length2tails_reward/std": 0.2377612441778183, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0824267603456974, + "epoch": 2.7119999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08523330837488174, + "learning_rate": 8.975972042185687e-07, + "loss": -0.0042, + "num_tokens": 11824242.0, + "reward": 13.403091430664062, + "reward_std": 1.459092378616333, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.510202169418335, + "rewards/kidney_reward/std": 0.5214763283729553, + "rewards/length2tails_reward/mean": 0.6743561029434204, + "rewards/length2tails_reward/std": 0.30871862173080444, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4217782020568848, + "rewards/thermo_reward/std": 1.0372663736343384, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09593377634882927, + "epoch": 2.714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32700106501579285, + "learning_rate": 8.963217469802226e-07, + "loss": 0.005, + "num_tokens": 11832968.0, + "reward": 13.640189170837402, + "reward_std": 1.546877145767212, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5230116844177246, + "rewards/kidney_reward/std": 0.585303783416748, + "rewards/length2tails_reward/mean": 0.744003176689148, + "rewards/length2tails_reward/std": 0.2514055371284485, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.581591844558716, + "rewards/thermo_reward/std": 0.9852604269981384, + "step": 1357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08936795685440302, + "epoch": 2.716, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07124438136816025, + "learning_rate": 8.950464602147132e-07, + "loss": -0.0029, + "num_tokens": 11841695.0, + "reward": 13.338068008422852, + "reward_std": 1.5099149942398071, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7638334631919861, + "rewards/length2tails_reward/std": 0.24104620516300201, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2287392616271973, + "rewards/thermo_reward/std": 1.305456519126892, + "step": 1358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.03125, + "completions/mean_terminated_length": 271.03125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09379763435572386, + "epoch": 2.718, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13047315180301666, + "learning_rate": 8.93771346018929e-07, + "loss": 0.0011, + "num_tokens": 11850400.0, + "reward": 12.898988723754883, + "reward_std": 3.223174571990967, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.3975424766540527, + "rewards/kidney_reward/std": 0.9989736676216125, + "rewards/length2tails_reward/mean": 0.7121099233627319, + "rewards/length2tails_reward/std": 0.2626727223396301, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0840697288513184, + "rewards/thermo_reward/std": 1.7830955982208252, + "step": 1359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.07782580750063062, + "epoch": 2.7199999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10690188407897949, + "learning_rate": 8.924964064894753e-07, + "loss": 0.0012, + "num_tokens": 11859110.0, + "reward": 13.796202659606934, + "reward_std": 0.47236502170562744, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7891558408737183, + "rewards/length2tails_reward/std": 0.2034577876329422, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.08494175039231777, + "epoch": 2.722, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12061955779790878, + "learning_rate": 8.912216437226691e-07, + "loss": 0.0057, + "num_tokens": 11867829.0, + "reward": 13.42155647277832, + "reward_std": 1.3494116067886353, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7937526106834412, + "rewards/length2tails_reward/std": 0.21244557201862335, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.309235095977783, + "rewards/thermo_reward/std": 1.156217098236084, + "step": 1361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09255943074822426, + "epoch": 2.724, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10911893844604492, + "learning_rate": 8.899470598145384e-07, + "loss": -0.0028, + "num_tokens": 11876583.0, + "reward": 13.61441421508789, + "reward_std": 1.3774797916412354, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.806898832321167, + "rewards/length2tails_reward/std": 0.25244489312171936, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4734182357788086, + "rewards/thermo_reward/std": 1.2543762922286987, + "step": 1362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09354557655751705, + "epoch": 2.726, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12077096849679947, + "learning_rate": 8.886726568608154e-07, + "loss": 0.0025, + "num_tokens": 11885341.0, + "reward": 13.799593925476074, + "reward_std": 0.46591177582740784, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8230618834495544, + "rewards/length2tails_reward/std": 0.21043244004249573, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08867257367819548, + "epoch": 2.7279999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07082948088645935, + "learning_rate": 8.873984369569358e-07, + "loss": -0.0058, + "num_tokens": 11894104.0, + "reward": 13.649602890014648, + "reward_std": 1.1290522813796997, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7938203811645508, + "rewards/length2tails_reward/std": 0.2440522462129593, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5099151134490967, + "rewards/thermo_reward/std": 0.9728322625160217, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08561956882476807, + "epoch": 2.73, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09584212303161621, + "learning_rate": 8.861244021980343e-07, + "loss": -0.0029, + "num_tokens": 11902832.0, + "reward": 13.62385368347168, + "reward_std": 1.2382583618164062, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7532392740249634, + "rewards/length2tails_reward/std": 0.28448599576950073, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4882240295410156, + "rewards/thermo_reward/std": 1.0843724012374878, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09875656943768263, + "epoch": 2.732, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08536585420370102, + "learning_rate": 8.848505546789406e-07, + "loss": -0.0008, + "num_tokens": 11911592.0, + "reward": 13.630233764648438, + "reward_std": 1.0378588438034058, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8287537097930908, + "rewards/length2tails_reward/std": 0.1734320968389511, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4870524406433105, + "rewards/thermo_reward/std": 0.9019082188606262, + "step": 1366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09804850351065397, + "epoch": 2.734, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1275150179862976, + "learning_rate": 8.835768964941772e-07, + "loss": -0.0042, + "num_tokens": 11920323.0, + "reward": 13.905082702636719, + "reward_std": 0.3192027807235718, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.6813804507255554, + "rewards/length2tails_reward/std": 0.2995351254940033, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08263131324201822, + "epoch": 2.7359999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09288492053747177, + "learning_rate": 8.823034297379546e-07, + "loss": -0.0018, + "num_tokens": 11929052.0, + "reward": 13.093076705932617, + "reward_std": 3.034579038619995, + "rewards/fitness_reward/mean": 6.9872846603393555, + "rewards/fitness_reward/std": 2.1151013374328613, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7136950492858887, + "rewards/length2tails_reward/std": 0.33012154698371887, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3353025913238525, + "rewards/thermo_reward/std": 0.9062677621841431, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08855268033221364, + "epoch": 2.738, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09174522757530212, + "learning_rate": 8.810301565041691e-07, + "loss": -0.0028, + "num_tokens": 11937786.0, + "reward": 13.751238822937012, + "reward_std": 0.509857177734375, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7383784055709839, + "rewards/length2tails_reward/std": 0.29278847575187683, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.09602725226432085, + "epoch": 2.74, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1331060528755188, + "learning_rate": 8.797570788863988e-07, + "loss": -0.002, + "num_tokens": 11946487.0, + "reward": 13.524124145507812, + "reward_std": 1.7451503276824951, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.510956287384033, + "rewards/kidney_reward/std": 0.5174046158790588, + "rewards/length2tails_reward/mean": 0.7649425864219666, + "rewards/length2tails_reward/std": 0.2521741986274719, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.475489616394043, + "rewards/thermo_reward/std": 1.2552281618118286, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09053944982588291, + "epoch": 2.742, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10725681483745575, + "learning_rate": 8.784841989778996e-07, + "loss": 0.0016, + "num_tokens": 11955199.0, + "reward": 13.909860610961914, + "reward_std": 0.31133556365966797, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7291494011878967, + "rewards/length2tails_reward/std": 0.26707524061203003, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08970591751858592, + "epoch": 2.7439999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10110809653997421, + "learning_rate": 8.772115188716032e-07, + "loss": 0.0012, + "num_tokens": 11963915.0, + "reward": 13.511757850646973, + "reward_std": 1.1564346551895142, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.6929255723953247, + "rewards/length2tails_reward/std": 0.3160499632358551, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.436877965927124, + "rewards/thermo_reward/std": 0.9635177850723267, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08349447511136532, + "epoch": 2.746, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08993352204561234, + "learning_rate": 8.759390406601115e-07, + "loss": -0.0005, + "num_tokens": 11972671.0, + "reward": 13.653955459594727, + "reward_std": 0.9975864887237549, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.799585223197937, + "rewards/length2tails_reward/std": 0.20988598465919495, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.513690710067749, + "rewards/thermo_reward/std": 0.9536970257759094, + "step": 1373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.90625, + "completions/mean_terminated_length": 273.90625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09851919766515493, + "epoch": 2.748, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14624251425266266, + "learning_rate": 8.746667664356955e-07, + "loss": 0.0009, + "num_tokens": 11981468.0, + "reward": 13.551703453063965, + "reward_std": 1.1514352560043335, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8635172843933105, + "rewards/length2tails_reward/std": 0.1821584850549698, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4324052333831787, + "rewards/thermo_reward/std": 0.9851658940315247, + "step": 1374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.08834454417228699, + "epoch": 2.75, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1254044473171234, + "learning_rate": 8.733946982902911e-07, + "loss": 0.004, + "num_tokens": 11990197.0, + "reward": 13.839938163757324, + "reward_std": 0.4260871112346649, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8276517987251282, + "rewards/length2tails_reward/std": 0.20928896963596344, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08731277380138636, + "epoch": 2.752, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13713133335113525, + "learning_rate": 8.721228383154939e-07, + "loss": 0.0008, + "num_tokens": 11998932.0, + "reward": 13.650564193725586, + "reward_std": 0.9174665212631226, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7925777435302734, + "rewards/length2tails_reward/std": 0.23623481392860413, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4836416244506836, + "rewards/thermo_reward/std": 0.9184372425079346, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08832689933478832, + "epoch": 2.754, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0689794272184372, + "learning_rate": 8.708511886025583e-07, + "loss": -0.0055, + "num_tokens": 12007696.0, + "reward": 13.400384902954102, + "reward_std": 2.495495557785034, + "rewards/fitness_reward/mean": 7.051910400390625, + "rewards/fitness_reward/std": 1.7495219707489014, + "rewards/kidney_reward/mean": 2.5385398864746094, + "rewards/kidney_reward/std": 0.49746304750442505, + "rewards/length2tails_reward/mean": 0.8031256198883057, + "rewards/length2tails_reward/std": 0.25186508893966675, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08293364942073822, + "epoch": 2.7560000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12998053431510925, + "learning_rate": 8.695797512423931e-07, + "loss": 0.0027, + "num_tokens": 12016434.0, + "reward": 13.634329795837402, + "reward_std": 0.5833243727684021, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7658626437187195, + "rewards/length2tails_reward/std": 0.2717556953430176, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.470078706741333, + "rewards/thermo_reward/std": 0.5830413699150085, + "step": 1378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.086912976577878, + "epoch": 2.758, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06801780313253403, + "learning_rate": 8.683085283255576e-07, + "loss": -0.0011, + "num_tokens": 12025163.0, + "reward": 13.779071807861328, + "reward_std": 0.845180094242096, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8288226127624512, + "rewards/length2tails_reward/std": 0.22464440762996674, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6085243225097656, + "rewards/thermo_reward/std": 0.8422261476516724, + "step": 1379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.0917194364592433, + "epoch": 2.76, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16181787848472595, + "learning_rate": 8.670375219422577e-07, + "loss": -0.005, + "num_tokens": 12033864.0, + "reward": 12.928831100463867, + "reward_std": 2.7798335552215576, + "rewards/fitness_reward/mean": 6.994358062744141, + "rewards/fitness_reward/std": 1.7694021463394165, + "rewards/kidney_reward/mean": 2.5039236545562744, + "rewards/kidney_reward/std": 0.5554940104484558, + "rewards/length2tails_reward/mean": 0.7913016080856323, + "rewards/length2tails_reward/std": 0.2935396432876587, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.251418352127075, + "rewards/thermo_reward/std": 1.1121165752410889, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.089608708396554, + "epoch": 2.762, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15328380465507507, + "learning_rate": 8.657667341823448e-07, + "loss": 0.0052, + "num_tokens": 12042585.0, + "reward": 13.086148262023926, + "reward_std": 2.592722177505493, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3998496532440186, + "rewards/kidney_reward/std": 0.796771228313446, + "rewards/length2tails_reward/mean": 0.7404778599739075, + "rewards/length2tails_reward/std": 0.28793251514434814, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2085750102996826, + "rewards/thermo_reward/std": 1.6067379713058472, + "step": 1381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 285.6875, + "completions/mean_terminated_length": 270.58062744140625, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.0945020318031311, + "epoch": 2.7640000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5476410984992981, + "learning_rate": 8.644961671353095e-07, + "loss": -0.0133, + "num_tokens": 12051759.0, + "reward": 13.800397872924805, + "reward_std": 0.882570743560791, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7528252601623535, + "rewards/length2tails_reward/std": 0.2854486107826233, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6374502182006836, + "rewards/thermo_reward/std": 0.8803324103355408, + "step": 1382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09110002871602774, + "epoch": 2.766, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09588117897510529, + "learning_rate": 8.632258228902804e-07, + "loss": -0.0019, + "num_tokens": 12060504.0, + "reward": 13.497045516967773, + "reward_std": 1.180594801902771, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7557010054588318, + "rewards/length2tails_reward/std": 0.24613431096076965, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3885293006896973, + "rewards/thermo_reward/std": 1.0135005712509155, + "step": 1383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.40625, + "completions/mean_terminated_length": 273.40625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.09245431423187256, + "epoch": 2.768, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4146077036857605, + "learning_rate": 8.619557035360195e-07, + "loss": -0.0036, + "num_tokens": 12069285.0, + "reward": 13.30376148223877, + "reward_std": 2.591141939163208, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4650182723999023, + "rewards/kidney_reward/std": 0.7693151831626892, + "rewards/length2tails_reward/mean": 0.845126748085022, + "rewards/length2tails_reward/std": 0.19638904929161072, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3505544662475586, + "rewards/thermo_reward/std": 1.5948100090026855, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08536792267113924, + "epoch": 2.77, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10885151475667953, + "learning_rate": 8.606858111609187e-07, + "loss": -0.0003, + "num_tokens": 12078014.0, + "reward": 13.27672004699707, + "reward_std": 2.586475133895874, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4506547451019287, + "rewards/kidney_reward/std": 0.8490661382675171, + "rewards/length2tails_reward/mean": 0.7651956677436829, + "rewards/length2tails_reward/std": 0.270557165145874, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2883613109588623, + "rewards/thermo_reward/std": 1.8263949155807495, + "step": 1385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 274.4375, + "completions/mean_terminated_length": 274.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08724306477233768, + "epoch": 2.7720000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08859222382307053, + "learning_rate": 8.594161478529973e-07, + "loss": -0.007, + "num_tokens": 12086828.0, + "reward": 13.583634376525879, + "reward_std": 1.3034932613372803, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8490362167358398, + "rewards/length2tails_reward/std": 0.22278374433517456, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4110660552978516, + "rewards/thermo_reward/std": 1.2885096073150635, + "step": 1386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.40625, + "completions/mean_terminated_length": 272.40625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08765661623328924, + "epoch": 2.774, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46068093180656433, + "learning_rate": 8.58146715699897e-07, + "loss": 0.0018, + "num_tokens": 12095577.0, + "reward": 11.891407012939453, + "reward_std": 6.213120460510254, + "rewards/fitness_reward/mean": 6.643294334411621, + "rewards/fitness_reward/std": 2.8548996448516846, + "rewards/kidney_reward/mean": 2.1450510025024414, + "rewards/kidney_reward/std": 1.608840823173523, + "rewards/length2tails_reward/mean": 0.7848142981529236, + "rewards/length2tails_reward/std": 0.25247421860694885, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.9245800971984863, + "rewards/thermo_reward/std": 2.1157755851745605, + "step": 1387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.09168736450374126, + "epoch": 2.776, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10756999254226685, + "learning_rate": 8.568775167888805e-07, + "loss": -0.0002, + "num_tokens": 12104306.0, + "reward": 13.713671684265137, + "reward_std": 0.965480625629425, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7663874626159668, + "rewards/length2tails_reward/std": 0.2602115273475647, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.549368143081665, + "rewards/thermo_reward/std": 0.9606295228004456, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09946115221828222, + "epoch": 2.778, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05923334136605263, + "learning_rate": 8.556085532068266e-07, + "loss": -0.005, + "num_tokens": 12113039.0, + "reward": 13.344707489013672, + "reward_std": 3.0077295303344727, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.5390896797180176, + "rewards/kidney_reward/std": 0.49435171484947205, + "rewards/length2tails_reward/mean": 0.7656229138374329, + "rewards/length2tails_reward/std": 0.27780574560165405, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5760021209716797, + "rewards/thermo_reward/std": 0.8242037892341614, + "step": 1389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08487782347947359, + "epoch": 2.7800000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1074460819363594, + "learning_rate": 8.543398270402264e-07, + "loss": -0.0027, + "num_tokens": 12121804.0, + "reward": 13.341554641723633, + "reward_std": 1.6601860523223877, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7996214032173157, + "rewards/length2tails_reward/std": 0.27192988991737366, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2286460399627686, + "rewards/thermo_reward/std": 1.4481639862060547, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08386108372360468, + "epoch": 2.782, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10229164361953735, + "learning_rate": 8.530713403751821e-07, + "loss": 0.0001, + "num_tokens": 12130569.0, + "reward": 13.57174015045166, + "reward_std": 0.6612291932106018, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.812415361404419, + "rewards/length2tails_reward/std": 0.2513531446456909, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4301929473876953, + "rewards/thermo_reward/std": 0.6010707020759583, + "step": 1391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.08745740633457899, + "epoch": 2.784, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15082944929599762, + "learning_rate": 8.518030952974009e-07, + "loss": 0.0019, + "num_tokens": 12139280.0, + "reward": 13.673470497131348, + "reward_std": 0.9771033525466919, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7747855186462402, + "rewards/length2tails_reward/std": 0.25838014483451843, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5356860160827637, + "rewards/thermo_reward/std": 0.8443465232849121, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "entropy": 0.0878392974846065, + "epoch": 2.786, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0767536535859108, + "learning_rate": 8.50535093892193e-07, + "loss": 0.0034, + "num_tokens": 12148018.0, + "reward": 13.795819282531738, + "reward_std": 0.46981340646743774, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7853190898895264, + "rewards/length2tails_reward/std": 0.24550072848796844, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09728020057082176, + "epoch": 2.7880000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10178124159574509, + "learning_rate": 8.492673382444686e-07, + "loss": -0.0005, + "num_tokens": 12156763.0, + "reward": 13.9190673828125, + "reward_std": 0.3134841024875641, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8212195634841919, + "rewards/length2tails_reward/std": 0.1988692432641983, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08629786409437656, + "epoch": 2.79, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.058435454964637756, + "learning_rate": 8.479998304387328e-07, + "loss": -0.005, + "num_tokens": 12165486.0, + "reward": 13.33970832824707, + "reward_std": 3.007207155227661, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.5390896797180176, + "rewards/kidney_reward/std": 0.49435171484947205, + "rewards/length2tails_reward/mean": 0.7156283855438232, + "rewards/length2tails_reward/std": 0.30288150906562805, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5760021209716797, + "rewards/thermo_reward/std": 0.8242037892341614, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.09454571455717087, + "epoch": 2.792, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09265205264091492, + "learning_rate": 8.46732572559084e-07, + "loss": -0.0003, + "num_tokens": 12174150.0, + "reward": 13.47976303100586, + "reward_std": 1.12965989112854, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7220099568367004, + "rewards/length2tails_reward/std": 0.24405024945735931, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.319897174835205, + "rewards/thermo_reward/std": 1.1181665658950806, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 284.15625, + "completions/mean_terminated_length": 284.15625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.11376719083636999, + "epoch": 2.794, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5755971074104309, + "learning_rate": 8.454655666892094e-07, + "loss": -0.0072, + "num_tokens": 12183275.0, + "reward": 13.46137523651123, + "reward_std": 1.9065113067626953, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.520040273666382, + "rewards/kidney_reward/std": 0.6021116971969604, + "rewards/length2tails_reward/mean": 0.7784010171890259, + "rewards/length2tails_reward/std": 0.28705984354019165, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4598193168640137, + "rewards/thermo_reward/std": 1.0365558862686157, + "step": 1397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.08584170322865248, + "epoch": 2.7960000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0858711525797844, + "learning_rate": 8.441988149123817e-07, + "loss": 0.0018, + "num_tokens": 12191995.0, + "reward": 13.758430480957031, + "reward_std": 1.163153886795044, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8490825891494751, + "rewards/length2tails_reward/std": 0.18313346803188324, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6132168769836426, + "rewards/thermo_reward/std": 1.013451099395752, + "step": 1398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.03125, + "completions/mean_terminated_length": 273.03125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08808640111237764, + "epoch": 2.798, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05051012709736824, + "learning_rate": 8.42932319311456e-07, + "loss": -0.0034, + "num_tokens": 12200764.0, + "reward": 13.767698287963867, + "reward_std": 0.9061033725738525, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8230335712432861, + "rewards/length2tails_reward/std": 0.24817439913749695, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5977301597595215, + "rewards/thermo_reward/std": 0.8991841077804565, + "step": 1399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.53125, + "completions/mean_terminated_length": 272.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09016821812838316, + "epoch": 2.8, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08677458018064499, + "learning_rate": 8.416660819688658e-07, + "loss": -0.0029, + "num_tokens": 12209517.0, + "reward": 13.91813850402832, + "reward_std": 0.3208127021789551, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8119388818740845, + "rewards/length2tails_reward/std": 0.25621432065963745, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 273.375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08616777881979942, + "epoch": 2.802, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12825022637844086, + "learning_rate": 8.40400104966621e-07, + "loss": 0.0059, + "num_tokens": 12218297.0, + "reward": 13.762215614318848, + "reward_std": 0.5018707513809204, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8481432795524597, + "rewards/length2tails_reward/std": 0.17002004384994507, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.08310704492032528, + "epoch": 2.8040000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06590055674314499, + "learning_rate": 8.391343903863017e-07, + "loss": -0.0002, + "num_tokens": 12227030.0, + "reward": 13.777181625366211, + "reward_std": 1.0704293251037598, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8627910614013672, + "rewards/length2tails_reward/std": 0.1706744283437729, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6305975914001465, + "rewards/thermo_reward/std": 0.9178563356399536, + "step": 1402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.625, + "completions/mean_terminated_length": 270.625, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "entropy": 0.08667949680238962, + "epoch": 2.806, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24901294708251953, + "learning_rate": 8.378689403090582e-07, + "loss": 0.0021, + "num_tokens": 12235722.0, + "reward": 13.747515678405762, + "reward_std": 0.4992513060569763, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7011414766311646, + "rewards/length2tails_reward/std": 0.25532442331314087, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.53125, + "completions/mean_terminated_length": 272.53125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.07546252477914095, + "epoch": 2.808, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1531989723443985, + "learning_rate": 8.366037568156047e-07, + "loss": -0.0059, + "num_tokens": 12244475.0, + "reward": 13.314666748046875, + "reward_std": 2.8027918338775635, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4331979751586914, + "rewards/kidney_reward/std": 0.9463348984718323, + "rewards/length2tails_reward/mean": 0.7847499251365662, + "rewards/length2tails_reward/std": 0.23787519335746765, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3993189334869385, + "rewards/thermo_reward/std": 1.6910643577575684, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09690284915268421, + "epoch": 2.81, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12429013848304749, + "learning_rate": 8.353388419862178e-07, + "loss": -0.0065, + "num_tokens": 12253231.0, + "reward": 13.412312507629395, + "reward_std": 1.657273530960083, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8280727863311768, + "rewards/length2tails_reward/std": 0.23345917463302612, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.354067802429199, + "rewards/thermo_reward/std": 1.2732762098312378, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.08442273642867804, + "epoch": 2.8120000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09026562422513962, + "learning_rate": 8.340741979007324e-07, + "loss": 0.0047, + "num_tokens": 12261924.0, + "reward": 13.80896282196045, + "reward_std": 0.5169821977615356, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7914867401123047, + "rewards/length2tails_reward/std": 0.21802614629268646, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.08891510032117367, + "epoch": 2.814, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09019751101732254, + "learning_rate": 8.328098266385373e-07, + "loss": 0.0019, + "num_tokens": 12270663.0, + "reward": 13.716066360473633, + "reward_std": 1.1704113483428955, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8142136335372925, + "rewards/length2tails_reward/std": 0.22736941277980804, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.546980381011963, + "rewards/thermo_reward/std": 1.1725293397903442, + "step": 1407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 274.21875, + "completions/mean_terminated_length": 274.21875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09006855543702841, + "epoch": 2.816, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08318524062633514, + "learning_rate": 8.315457302785742e-07, + "loss": -0.0012, + "num_tokens": 12279470.0, + "reward": 13.925493240356445, + "reward_std": 0.313385009765625, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8854849934577942, + "rewards/length2tails_reward/std": 0.14626377820968628, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 273.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09508083388209343, + "epoch": 2.818, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06987155228853226, + "learning_rate": 8.302819108993311e-07, + "loss": -0.0013, + "num_tokens": 12288250.0, + "reward": 13.839058876037598, + "reward_std": 0.4329962432384491, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8188518285751343, + "rewards/length2tails_reward/std": 0.2515736520290375, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08047976158559322, + "epoch": 2.82, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10217437148094177, + "learning_rate": 8.290183705788418e-07, + "loss": -0.0044, + "num_tokens": 12296987.0, + "reward": 13.645854949951172, + "reward_std": 1.115564227104187, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.539055347442627, + "rewards/kidney_reward/std": 0.4945458173751831, + "rewards/length2tails_reward/mean": 0.7350102663040161, + "rewards/length2tails_reward/std": 0.2945704162120819, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.09375, + "completions/mean_terminated_length": 271.09375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.08697906043380499, + "epoch": 2.822, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09989477694034576, + "learning_rate": 8.277551113946811e-07, + "loss": 0.0034, + "num_tokens": 12305694.0, + "reward": 13.754642486572266, + "reward_std": 0.5036185383796692, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7724176645278931, + "rewards/length2tails_reward/std": 0.25681835412979126, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08093288633972406, + "epoch": 2.824, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0949145033955574, + "learning_rate": 8.264921354239608e-07, + "loss": 0.0011, + "num_tokens": 12314441.0, + "reward": 13.752120971679688, + "reward_std": 0.5054385662078857, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7471965551376343, + "rewards/length2tails_reward/std": 0.2987217903137207, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08343170490115881, + "epoch": 2.826, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.061486851423978806, + "learning_rate": 8.252294447433282e-07, + "loss": -0.0049, + "num_tokens": 12323175.0, + "reward": 13.275848388671875, + "reward_std": 3.016287326812744, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.511730670928955, + "rewards/kidney_reward/std": 0.5132253766059875, + "rewards/length2tails_reward/mean": 0.7494857311248779, + "rewards/length2tails_reward/std": 0.27652037143707275, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.536116123199463, + "rewards/thermo_reward/std": 0.8422485589981079, + "step": 1413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 273.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08979653380811214, + "epoch": 2.828, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1077917143702507, + "learning_rate": 8.239670414289602e-07, + "loss": -0.0046, + "num_tokens": 12331955.0, + "reward": 13.600502967834473, + "reward_std": 1.2754502296447754, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7997991442680359, + "rewards/length2tails_reward/std": 0.27430152893066406, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.487576723098755, + "rewards/thermo_reward/std": 1.0877385139465332, + "step": 1414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08586056530475616, + "epoch": 2.83, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11145995557308197, + "learning_rate": 8.227049275565622e-07, + "loss": -0.002, + "num_tokens": 12340696.0, + "reward": 13.244773864746094, + "reward_std": 2.0113065242767334, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4455184936523438, + "rewards/kidney_reward/std": 0.7122268080711365, + "rewards/length2tails_reward/mean": 0.7700451612472534, + "rewards/length2tails_reward/std": 0.2829984128475189, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2610652446746826, + "rewards/thermo_reward/std": 1.3290748596191406, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 728.0, + "completions/max_terminated_length": 728.0, + "completions/mean_length": 287.21875, + "completions/mean_terminated_length": 287.21875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09500201977789402, + "epoch": 2.832, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7574090361595154, + "learning_rate": 8.214431052013634e-07, + "loss": -0.0228, + "num_tokens": 12349919.0, + "reward": 13.81132698059082, + "reward_std": 0.5325340032577515, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8151376843452454, + "rewards/length2tails_reward/std": 0.28029194474220276, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.40625, + "completions/mean_terminated_length": 272.40625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08386566769331694, + "epoch": 2.834, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10139825195074081, + "learning_rate": 8.201815764381133e-07, + "loss": -0.0038, + "num_tokens": 12358668.0, + "reward": 12.858728408813477, + "reward_std": 3.645132541656494, + "rewards/fitness_reward/mean": 7.188657760620117, + "rewards/fitness_reward/std": 0.7179933190345764, + "rewards/kidney_reward/mean": 2.335433006286621, + "rewards/kidney_reward/std": 1.1454758644104004, + "rewards/length2tails_reward/mean": 0.7713392972946167, + "rewards/length2tails_reward/std": 0.2946726381778717, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.157503604888916, + "rewards/thermo_reward/std": 1.9195927381515503, + "step": 1417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.08110349159687757, + "epoch": 2.836, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07257678359746933, + "learning_rate": 8.189203433410794e-07, + "loss": 0.0014, + "num_tokens": 12367434.0, + "reward": 13.879064559936523, + "reward_std": 0.38343676924705505, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8200562000274658, + "rewards/length2tails_reward/std": 0.276693731546402, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.40625, + "completions/mean_terminated_length": 271.40625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.07928761281073093, + "epoch": 2.838, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0934724509716034, + "learning_rate": 8.176594079840422e-07, + "loss": -0.0064, + "num_tokens": 12376151.0, + "reward": 13.905878067016602, + "reward_std": 0.3248470723628998, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.6893353462219238, + "rewards/length2tails_reward/std": 0.30571213364601135, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.8125, + "completions/mean_terminated_length": 272.8125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0859050927683711, + "epoch": 2.84, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08270053565502167, + "learning_rate": 8.163987724402934e-07, + "loss": -0.0029, + "num_tokens": 12384913.0, + "reward": 13.734323501586914, + "reward_std": 0.8570379614830017, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7574863433837891, + "rewards/length2tails_reward/std": 0.3112337589263916, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5709095001220703, + "rewards/thermo_reward/std": 0.8499181270599365, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.09258036874234676, + "epoch": 2.842, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1705041527748108, + "learning_rate": 8.151384387826313e-07, + "loss": 0.0069, + "num_tokens": 12393611.0, + "reward": 13.437908172607422, + "reward_std": 1.449847936630249, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8329051733016968, + "rewards/length2tails_reward/std": 0.23473943769931793, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.321671962738037, + "rewards/thermo_reward/std": 1.2519351243972778, + "step": 1421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08808799367398024, + "epoch": 2.844, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0893077403306961, + "learning_rate": 8.138784090833577e-07, + "loss": 0.008, + "num_tokens": 12402365.0, + "reward": 13.877182006835938, + "reward_std": 0.37219929695129395, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8012359142303467, + "rewards/length2tails_reward/std": 0.22292174398899078, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.08956156764179468, + "epoch": 2.846, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10173401236534119, + "learning_rate": 8.126186854142751e-07, + "loss": 0.0061, + "num_tokens": 12411097.0, + "reward": 13.678201675415039, + "reward_std": 0.5565140247344971, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8057146668434143, + "rewards/length2tails_reward/std": 0.2487732470035553, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.50996470451355, + "rewards/thermo_reward/std": 0.5615194439888, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08605887554585934, + "epoch": 2.848, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07781686633825302, + "learning_rate": 8.11359269846683e-07, + "loss": -0.0006, + "num_tokens": 12419819.0, + "reward": 13.564287185668945, + "reward_std": 1.3377314805984497, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7410205602645874, + "rewards/length2tails_reward/std": 0.24903921782970428, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4298787117004395, + "rewards/thermo_reward/std": 1.1900341510772705, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.1875, + "completions/mean_terminated_length": 273.1875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0928757581859827, + "epoch": 2.85, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11290508508682251, + "learning_rate": 8.101001644513731e-07, + "loss": 0.0043, + "num_tokens": 12428593.0, + "reward": 13.877665519714355, + "reward_std": 0.37265855073928833, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8060607314109802, + "rewards/length2tails_reward/std": 0.24503538012504578, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.0, + "completions/max_terminated_length": 615.0, + "completions/mean_length": 283.53125, + "completions/mean_terminated_length": 283.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09894910361617804, + "epoch": 2.852, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3717043995857239, + "learning_rate": 8.088413712986279e-07, + "loss": -0.0102, + "num_tokens": 12437698.0, + "reward": 13.021846771240234, + "reward_std": 3.988328695297241, + "rewards/fitness_reward/mean": 7.037449836730957, + "rewards/fitness_reward/std": 1.8313246965408325, + "rewards/kidney_reward/mean": 2.465315341949463, + "rewards/kidney_reward/std": 0.7676700353622437, + "rewards/length2tails_reward/mean": 0.7774168252944946, + "rewards/length2tails_reward/std": 0.2898247241973877, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3413400650024414, + "rewards/thermo_reward/std": 1.4518558979034424, + "step": 1426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.21875, + "completions/mean_terminated_length": 273.21875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0861760089173913, + "epoch": 2.854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10765953361988068, + "learning_rate": 8.075828924582168e-07, + "loss": 0.0008, + "num_tokens": 12446473.0, + "reward": 13.878617286682129, + "reward_std": 0.3746810257434845, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8155795335769653, + "rewards/length2tails_reward/std": 0.25068867206573486, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08908339031040668, + "epoch": 2.856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11010465025901794, + "learning_rate": 8.063247299993918e-07, + "loss": -0.0018, + "num_tokens": 12455210.0, + "reward": 13.054346084594727, + "reward_std": 2.4582793712615967, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4899649620056152, + "rewards/kidney_reward/std": 0.631715714931488, + "rewards/length2tails_reward/mean": 0.777817964553833, + "rewards/length2tails_reward/std": 0.24456126987934113, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.082923412322998, + "rewards/thermo_reward/std": 1.7510353326797485, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10221673734486103, + "epoch": 2.858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10918369144201279, + "learning_rate": 8.05066885990885e-07, + "loss": -0.0024, + "num_tokens": 12463948.0, + "reward": 13.874887466430664, + "reward_std": 0.38162562251091003, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7782790064811707, + "rewards/length2tails_reward/std": 0.2557736933231354, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.90625, + "completions/mean_terminated_length": 273.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07712917029857635, + "epoch": 2.86, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08574642241001129, + "learning_rate": 8.03809362500905e-07, + "loss": 0.001, + "num_tokens": 12472745.0, + "reward": 13.801877975463867, + "reward_std": 0.47052812576293945, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8459126353263855, + "rewards/length2tails_reward/std": 0.2445998638868332, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0909727094694972, + "epoch": 2.862, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10815263539552689, + "learning_rate": 8.025521615971329e-07, + "loss": 0.0008, + "num_tokens": 12481468.0, + "reward": 13.79372787475586, + "reward_std": 0.47003355622291565, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7644029855728149, + "rewards/length2tails_reward/std": 0.22795794904232025, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 284.90625, + "completions/mean_terminated_length": 284.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09156226739287376, + "epoch": 2.864, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36374738812446594, + "learning_rate": 8.012952853467202e-07, + "loss": -0.0119, + "num_tokens": 12490617.0, + "reward": 13.561015129089355, + "reward_std": 1.168427586555481, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7868642807006836, + "rewards/length2tails_reward/std": 0.2786427438259125, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.422022819519043, + "rewards/thermo_reward/std": 1.0360568761825562, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.092946563847363, + "epoch": 2.866, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2058488130569458, + "learning_rate": 8.000387358162834e-07, + "loss": 0.0008, + "num_tokens": 12499377.0, + "reward": 12.085588455200195, + "reward_std": 4.778906345367432, + "rewards/fitness_reward/mean": 6.6559247970581055, + "rewards/fitness_reward/std": 2.7804884910583496, + "rewards/kidney_reward/mean": 2.268216371536255, + "rewards/kidney_reward/std": 1.1071895360946655, + "rewards/length2tails_reward/mean": 0.7589725255966187, + "rewards/length2tails_reward/std": 0.3145608901977539, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.98555064201355, + "rewards/thermo_reward/std": 1.6683814525604248, + "step": 1433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.25, + "completions/mean_terminated_length": 273.25, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08142790850251913, + "epoch": 2.868, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08400456607341766, + "learning_rate": 7.98782515071903e-07, + "loss": 0.001, + "num_tokens": 12508153.0, + "reward": 13.958139419555664, + "reward_std": 0.22377903759479523, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8130799531936646, + "rewards/length2tails_reward/std": 0.24826769530773163, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.34375, + "completions/mean_terminated_length": 273.34375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08738233055919409, + "epoch": 2.87, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10952440649271011, + "learning_rate": 7.975266251791184e-07, + "loss": -0.0004, + "num_tokens": 12516932.0, + "reward": 13.879385948181152, + "reward_std": 0.3758961260318756, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8232645988464355, + "rewards/length2tails_reward/std": 0.21714530885219574, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08811240177601576, + "epoch": 2.872, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2943911850452423, + "learning_rate": 7.962710682029244e-07, + "loss": -0.0014, + "num_tokens": 12525690.0, + "reward": 13.523576736450195, + "reward_std": 1.5703537464141846, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7819285988807678, + "rewards/length2tails_reward/std": 0.25686317682266235, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3850791454315186, + "rewards/thermo_reward/std": 1.4264148473739624, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.96875, + "completions/mean_terminated_length": 273.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0862605981528759, + "epoch": 2.874, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10683736950159073, + "learning_rate": 7.950158462077697e-07, + "loss": 0.0024, + "num_tokens": 12534489.0, + "reward": 13.596105575561523, + "reward_std": 1.2210086584091187, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8528375625610352, + "rewards/length2tails_reward/std": 0.22560538351535797, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.45051646232605, + "rewards/thermo_reward/std": 1.083735466003418, + "step": 1437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09131797589361668, + "epoch": 2.876, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.089107945561409, + "learning_rate": 7.937609612575511e-07, + "loss": 0.0047, + "num_tokens": 12543220.0, + "reward": 13.794271469116211, + "reward_std": 0.4652538597583771, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7698436975479126, + "rewards/length2tails_reward/std": 0.20634371042251587, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08963609486818314, + "epoch": 2.878, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08300493657588959, + "learning_rate": 7.925064154156114e-07, + "loss": -0.0056, + "num_tokens": 12551971.0, + "reward": 13.066444396972656, + "reward_std": 3.2766692638397217, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.511730670928955, + "rewards/kidney_reward/std": 0.5132253766059875, + "rewards/length2tails_reward/mean": 0.7815616130828857, + "rewards/length2tails_reward/std": 0.26363322138786316, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.323504686355591, + "rewards/thermo_reward/std": 1.4982414245605469, + "step": 1439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.34375, + "completions/mean_terminated_length": 273.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08487140201032162, + "epoch": 2.88, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14700040221214294, + "learning_rate": 7.912522107447366e-07, + "loss": 0.0016, + "num_tokens": 12560750.0, + "reward": 13.799543380737305, + "reward_std": 0.46718329191207886, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8225642442703247, + "rewards/length2tails_reward/std": 0.2352154701948166, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6296226978302, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.08915096707642078, + "epoch": 2.882, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12797556817531586, + "learning_rate": 7.899983493071506e-07, + "loss": 0.0027, + "num_tokens": 12569473.0, + "reward": 13.651243209838867, + "reward_std": 0.6270330548286438, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8097268342971802, + "rewards/length2tails_reward/std": 0.28132137656211853, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.50996470451355, + "rewards/thermo_reward/std": 0.5615194439888, + "step": 1441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09121816977858543, + "epoch": 2.884, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09454463422298431, + "learning_rate": 7.887448331645137e-07, + "loss": 0.001, + "num_tokens": 12578239.0, + "reward": 13.510807037353516, + "reward_std": 1.0569266080856323, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8264129161834717, + "rewards/length2tails_reward/std": 0.21036264300346375, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3678598403930664, + "rewards/thermo_reward/std": 0.9316072463989258, + "step": 1442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09095319919288158, + "epoch": 2.886, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08146243542432785, + "learning_rate": 7.874916643779184e-07, + "loss": -0.0042, + "num_tokens": 12586977.0, + "reward": 12.570428848266602, + "reward_std": 3.5493857860565186, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.420412063598633, + "rewards/kidney_reward/std": 0.7131741642951965, + "rewards/length2tails_reward/mean": 0.7441259622573853, + "rewards/length2tails_reward/std": 0.3011590242385864, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.922551393508911, + "rewards/thermo_reward/std": 1.802452802658081, + "step": 1443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0917142340913415, + "epoch": 2.888, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0717688724398613, + "learning_rate": 7.862388450078854e-07, + "loss": 0.0002, + "num_tokens": 12595716.0, + "reward": 13.552705764770508, + "reward_std": 1.637266993522644, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.534823417663574, + "rewards/kidney_reward/std": 0.5184864401817322, + "rewards/length2tails_reward/mean": 0.7994788885116577, + "rewards/length2tails_reward/std": 0.19421547651290894, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4767494201660156, + "rewards/thermo_reward/std": 1.2395474910736084, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.08649328723549843, + "epoch": 2.89, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09909166395664215, + "learning_rate": 7.84986377114362e-07, + "loss": -0.001, + "num_tokens": 12604425.0, + "reward": 13.83226203918457, + "reward_std": 0.4334369897842407, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7508825063705444, + "rewards/length2tails_reward/std": 0.30833303928375244, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08636385947465897, + "epoch": 2.892, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10228565335273743, + "learning_rate": 7.837342627567165e-07, + "loss": -0.0023, + "num_tokens": 12613141.0, + "reward": 13.536503791809082, + "reward_std": 1.9209715127944946, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.500931978225708, + "rewards/kidney_reward/std": 0.7102046608924866, + "rewards/length2tails_reward/mean": 0.7440629005432129, + "rewards/length2tails_reward/std": 0.2628605365753174, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4999806880950928, + "rewards/thermo_reward/std": 1.222486138343811, + "step": 1446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09061552211642265, + "epoch": 2.894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5566885471343994, + "learning_rate": 7.824825039937368e-07, + "loss": -0.0042, + "num_tokens": 12621879.0, + "reward": 13.104745864868164, + "reward_std": 2.5030510425567627, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4164295196533203, + "rewards/kidney_reward/std": 0.8949340581893921, + "rewards/length2tails_reward/mean": 0.7900067567825317, + "rewards/length2tails_reward/std": 0.22052329778671265, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.205639362335205, + "rewards/thermo_reward/std": 1.4034382104873657, + "step": 1447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08591014333069324, + "epoch": 2.896, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06331221014261246, + "learning_rate": 7.81231102883625e-07, + "loss": -0.001, + "num_tokens": 12630598.0, + "reward": 12.890361785888672, + "reward_std": 2.966064214706421, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.4184765815734863, + "rewards/kidney_reward/std": 0.7438530325889587, + "rewards/length2tails_reward/mean": 0.7856767177581787, + "rewards/length2tails_reward/std": 0.23383821547031403, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.047151565551758, + "rewards/thermo_reward/std": 1.8174599409103394, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08721327036619186, + "epoch": 2.898, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11579962819814682, + "learning_rate": 7.799800614839964e-07, + "loss": -0.0061, + "num_tokens": 12639335.0, + "reward": 12.824150085449219, + "reward_std": 3.6121666431427, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.3733737468719482, + "rewards/kidney_reward/std": 0.9182524681091309, + "rewards/length2tails_reward/mean": 0.740673840045929, + "rewards/length2tails_reward/std": 0.28297778964042664, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2236557006835938, + "rewards/thermo_reward/std": 1.5752650499343872, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10069273272529244, + "epoch": 2.9, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08265271782875061, + "learning_rate": 7.787293818518737e-07, + "loss": -0.0047, + "num_tokens": 12648086.0, + "reward": 13.497522354125977, + "reward_std": 1.9584171772003174, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5227279663085938, + "rewards/kidney_reward/std": 0.5869088172912598, + "rewards/length2tails_reward/mean": 0.8091145157814026, + "rewards/length2tails_reward/std": 0.2377697378396988, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4326975345611572, + "rewards/thermo_reward/std": 1.3785746097564697, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.092111611738801, + "epoch": 2.902, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11661788821220398, + "learning_rate": 7.774790660436857e-07, + "loss": 0.0035, + "num_tokens": 12656834.0, + "reward": 13.799483299255371, + "reward_std": 0.46884268522262573, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8219585418701172, + "rewards/length2tails_reward/std": 0.24238049983978271, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.08617793815210462, + "epoch": 2.904, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10517249256372452, + "learning_rate": 7.762291161152626e-07, + "loss": 0.006, + "num_tokens": 12665568.0, + "reward": 13.836585998535156, + "reward_std": 0.4221171736717224, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7941212058067322, + "rewards/length2tails_reward/std": 0.2467154860496521, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09527615550905466, + "epoch": 2.906, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0897793173789978, + "learning_rate": 7.749795341218327e-07, + "loss": -0.0031, + "num_tokens": 12674315.0, + "reward": 13.245508193969727, + "reward_std": 2.678555727005005, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.462587356567383, + "rewards/kidney_reward/std": 0.7827925682067871, + "rewards/length2tails_reward/mean": 0.8029334545135498, + "rewards/length2tails_reward/std": 0.25929903984069824, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.298952341079712, + "rewards/thermo_reward/std": 1.6578506231307983, + "step": 1453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.1875, + "completions/mean_terminated_length": 273.1875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.09170580562204123, + "epoch": 2.908, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17641600966453552, + "learning_rate": 7.7373032211802e-07, + "loss": 0.0043, + "num_tokens": 12683089.0, + "reward": 12.860458374023438, + "reward_std": 4.126487731933594, + "rewards/fitness_reward/mean": 7.037131309509277, + "rewards/fitness_reward/std": 1.8331266641616821, + "rewards/kidney_reward/mean": 2.3966851234436035, + "rewards/kidney_reward/std": 0.8604899048805237, + "rewards/length2tails_reward/mean": 0.854651153087616, + "rewards/length2tails_reward/std": 0.21645672619342804, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.241177558898926, + "rewards/thermo_reward/std": 1.6034048795700073, + "step": 1454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09060567431151867, + "epoch": 2.91, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19656902551651, + "learning_rate": 7.724814821578395e-07, + "loss": 0.001, + "num_tokens": 12691834.0, + "reward": 12.79423999786377, + "reward_std": 4.301056861877441, + "rewards/fitness_reward/mean": 7.017977237701416, + "rewards/fitness_reward/std": 1.9414762258529663, + "rewards/kidney_reward/mean": 2.345012664794922, + "rewards/kidney_reward/std": 1.0418504476547241, + "rewards/length2tails_reward/mean": 0.7869172096252441, + "rewards/length2tails_reward/std": 0.2412094920873642, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2525582313537598, + "rewards/thermo_reward/std": 1.6153535842895508, + "step": 1455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0849033072590828, + "epoch": 2.912, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07628481835126877, + "learning_rate": 7.712330162946948e-07, + "loss": 0.0008, + "num_tokens": 12700577.0, + "reward": 13.83536434173584, + "reward_std": 0.4307062029838562, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7819132208824158, + "rewards/length2tails_reward/std": 0.2246393859386444, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09288029000163078, + "epoch": 2.914, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08784768730401993, + "learning_rate": 7.699849265813743e-07, + "loss": -0.0026, + "num_tokens": 12709310.0, + "reward": 13.766115188598633, + "reward_std": 0.5586167573928833, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7618662118911743, + "rewards/length2tails_reward/std": 0.27308955788612366, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.875, + "completions/mean_terminated_length": 269.875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.08489096444100142, + "epoch": 2.916, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07928179949522018, + "learning_rate": 7.687372150700479e-07, + "loss": 0.001, + "num_tokens": 12717978.0, + "reward": 13.911661148071289, + "reward_std": 0.3134597837924957, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7471531629562378, + "rewards/length2tails_reward/std": 0.2685222327709198, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.07988280057907104, + "epoch": 2.918, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0837949886918068, + "learning_rate": 7.674898838122638e-07, + "loss": 0.0008, + "num_tokens": 12726717.0, + "reward": 13.914548873901367, + "reward_std": 0.3166908025741577, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7760345339775085, + "rewards/length2tails_reward/std": 0.21083956956863403, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.8125, + "completions/mean_terminated_length": 273.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09522326570004225, + "epoch": 2.92, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11612401902675629, + "learning_rate": 7.662429348589446e-07, + "loss": -0.0007, + "num_tokens": 12735511.0, + "reward": 13.207071304321289, + "reward_std": 2.0308852195739746, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4523303508758545, + "rewards/kidney_reward/std": 0.6995065212249756, + "rewards/length2tails_reward/mean": 0.8716757297515869, + "rewards/length2tails_reward/std": 0.21875134110450745, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.206387996673584, + "rewards/thermo_reward/std": 1.3996530771255493, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.085783283226192, + "epoch": 2.922, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10536955296993256, + "learning_rate": 7.649963702603848e-07, + "loss": 0.0012, + "num_tokens": 12744241.0, + "reward": 13.83349609375, + "reward_std": 0.429172158241272, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7632323503494263, + "rewards/length2tails_reward/std": 0.25650447607040405, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.09125344175845385, + "epoch": 2.924, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08487284928560257, + "learning_rate": 7.637501920662468e-07, + "loss": -0.0004, + "num_tokens": 12752999.0, + "reward": 13.840091705322266, + "reward_std": 0.42784279584884644, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8291807174682617, + "rewards/length2tails_reward/std": 0.18743930757045746, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09677456505596638, + "epoch": 2.926, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09227211773395538, + "learning_rate": 7.62504402325557e-07, + "loss": -0.0005, + "num_tokens": 12761764.0, + "reward": 13.854175567626953, + "reward_std": 0.4863269329071045, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8447530269622803, + "rewards/length2tails_reward/std": 0.19483210146427155, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08659609872847795, + "epoch": 2.928, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06487475335597992, + "learning_rate": 7.612590030867039e-07, + "loss": -0.0053, + "num_tokens": 12770472.0, + "reward": 13.283210754394531, + "reward_std": 2.0315680503845215, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.504581928253174, + "rewards/kidney_reward/std": 0.5519188642501831, + "rewards/length2tails_reward/mean": 0.6866386532783508, + "rewards/length2tails_reward/std": 0.31638386845588684, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2487802505493164, + "rewards/thermo_reward/std": 1.5519336462020874, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.40625, + "completions/mean_terminated_length": 272.40625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08706388808786869, + "epoch": 2.93, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21007883548736572, + "learning_rate": 7.60013996397434e-07, + "loss": -0.0046, + "num_tokens": 12779221.0, + "reward": 13.285353660583496, + "reward_std": 2.5396311283111572, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4273016452789307, + "rewards/kidney_reward/std": 0.9792560338973999, + "rewards/length2tails_reward/mean": 0.7961028218269348, + "rewards/length2tails_reward/std": 0.23903968930244446, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.374765634536743, + "rewards/thermo_reward/std": 1.2762339115142822, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08097695140168071, + "epoch": 2.932, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08966746181249619, + "learning_rate": 7.587693843048474e-07, + "loss": 0.0017, + "num_tokens": 12787961.0, + "reward": 13.874987602233887, + "reward_std": 0.3732488453388214, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7792847156524658, + "rewards/length2tails_reward/std": 0.24024541676044464, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.09143519587814808, + "epoch": 2.934, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23096249997615814, + "learning_rate": 7.575251688553963e-07, + "loss": 0.0076, + "num_tokens": 12796644.0, + "reward": 13.87496566772461, + "reward_std": 0.37357449531555176, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7790652513504028, + "rewards/length2tails_reward/std": 0.3010183274745941, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.53125, + "completions/mean_terminated_length": 272.53125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08806372992694378, + "epoch": 2.936, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09937624633312225, + "learning_rate": 7.5628135209488e-07, + "loss": 0.0019, + "num_tokens": 12805397.0, + "reward": 13.958802223205566, + "reward_std": 0.22333241999149323, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.81971675157547, + "rewards/length2tails_reward/std": 0.19025535881519318, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.96875, + "completions/mean_terminated_length": 273.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0997593542560935, + "epoch": 2.9379999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09114154428243637, + "learning_rate": 7.550379360684434e-07, + "loss": 0.0028, + "num_tokens": 12814196.0, + "reward": 13.84214973449707, + "reward_std": 0.4247521460056305, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8497633934020996, + "rewards/length2tails_reward/std": 0.2314579039812088, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.53125, + "completions/mean_terminated_length": 270.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08079738728702068, + "epoch": 2.94, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11101573705673218, + "learning_rate": 7.537949228205708e-07, + "loss": -0.0035, + "num_tokens": 12822885.0, + "reward": 13.57027816772461, + "reward_std": 1.9160974025726318, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5229597091674805, + "rewards/kidney_reward/std": 0.5855976939201355, + "rewards/length2tails_reward/mean": 0.6812267303466797, + "rewards/length2tails_reward/std": 0.30966007709503174, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5755205154418945, + "rewards/thermo_reward/std": 1.0178793668746948, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08315122686326504, + "epoch": 2.942, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1604270339012146, + "learning_rate": 7.525523143950858e-07, + "loss": 0.0001, + "num_tokens": 12831630.0, + "reward": 12.32827377319336, + "reward_std": 6.565638065338135, + "rewards/fitness_reward/mean": 6.622296333312988, + "rewards/fitness_reward/std": 2.912938117980957, + "rewards/kidney_reward/mean": 2.213806390762329, + "rewards/kidney_reward/std": 1.6312694549560547, + "rewards/length2tails_reward/mean": 0.7732738256454468, + "rewards/length2tails_reward/std": 0.273733913898468, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3148441314697266, + "rewards/thermo_reward/std": 2.026035785675049, + "step": 1471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.08468564599752426, + "epoch": 2.944, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07568921148777008, + "learning_rate": 7.513101128351453e-07, + "loss": -0.0028, + "num_tokens": 12840378.0, + "reward": 13.400533676147461, + "reward_std": 2.264967203140259, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.509674072265625, + "rewards/kidney_reward/std": 0.6607514023780823, + "rewards/length2tails_reward/mean": 0.7946181297302246, + "rewards/length2tails_reward/std": 0.21099165081977844, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.407722234725952, + "rewards/thermo_reward/std": 1.3061423301696777, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08182617742568254, + "epoch": 2.9459999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09725268185138702, + "learning_rate": 7.500683201832382e-07, + "loss": -0.0047, + "num_tokens": 12849088.0, + "reward": 13.759465217590332, + "reward_std": 0.5583043098449707, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.6953686475753784, + "rewards/length2tails_reward/std": 0.29025208950042725, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 274.15625, + "completions/mean_terminated_length": 274.15625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.08809699164703488, + "epoch": 2.948, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12728384137153625, + "learning_rate": 7.488269384811799e-07, + "loss": 0.0048, + "num_tokens": 12857893.0, + "reward": 13.71151351928711, + "reward_std": 0.6438538432121277, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8883048892021179, + "rewards/length2tails_reward/std": 0.11851312965154648, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897367000579834, + "rewards/thermo_reward/std": 0.5061467885971069, + "step": 1474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08566201105713844, + "epoch": 2.95, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16148102283477783, + "learning_rate": 7.475859697701109e-07, + "loss": -0.0034, + "num_tokens": 12866614.0, + "reward": 13.455978393554688, + "reward_std": 1.2935900688171387, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7256759405136108, + "rewards/length2tails_reward/std": 0.26900699734687805, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.323105812072754, + "rewards/thermo_reward/std": 1.1443455219268799, + "step": 1475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09163531567901373, + "epoch": 2.952, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10324077308177948, + "learning_rate": 7.463454160904927e-07, + "loss": -0.0018, + "num_tokens": 12875377.0, + "reward": 13.806316375732422, + "reward_std": 0.47567877173423767, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8902928829193115, + "rewards/length2tails_reward/std": 0.12258625030517578, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.09375, + "completions/mean_terminated_length": 273.09375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.08791532320901752, + "epoch": 2.9539999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32206058502197266, + "learning_rate": 7.451052794821039e-07, + "loss": 0.0041, + "num_tokens": 12884148.0, + "reward": 13.934386253356934, + "reward_std": 0.3780563771724701, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8491396903991699, + "rewards/length2tails_reward/std": 0.1631694883108139, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 273.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08659681957215071, + "epoch": 2.956, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07478148490190506, + "learning_rate": 7.438655619840375e-07, + "loss": 0.0005, + "num_tokens": 12892928.0, + "reward": 13.849531173706055, + "reward_std": 0.48496684432029724, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7983073592185974, + "rewards/length2tails_reward/std": 0.2785865366458893, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08749987743794918, + "epoch": 2.958, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1306840181350708, + "learning_rate": 7.426262656346978e-07, + "loss": 0.0027, + "num_tokens": 12901685.0, + "reward": 13.835283279418945, + "reward_std": 0.4283444285392761, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7810969352722168, + "rewards/length2tails_reward/std": 0.24721378087997437, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08540861960500479, + "epoch": 2.96, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14198768138885498, + "learning_rate": 7.413873924717956e-07, + "loss": 0.001, + "num_tokens": 12910442.0, + "reward": 12.985288619995117, + "reward_std": 2.4996986389160156, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.452592670917511, + "rewards/kidney_reward/mean": 2.395813465118408, + "rewards/kidney_reward/std": 0.8154330849647522, + "rewards/length2tails_reward/mean": 0.7963902950286865, + "rewards/length2tails_reward/std": 0.2482798546552658, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.163670063018799, + "rewards/thermo_reward/std": 1.6197998523712158, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.03125, + "completions/mean_terminated_length": 273.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09230809472501278, + "epoch": 2.9619999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18161533772945404, + "learning_rate": 7.401489445323472e-07, + "loss": -0.0017, + "num_tokens": 12919211.0, + "reward": 13.677942276000977, + "reward_std": 0.5635209083557129, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8031283617019653, + "rewards/length2tails_reward/std": 0.25591933727264404, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.50996470451355, + "rewards/thermo_reward/std": 0.5615194439888, + "step": 1481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08371663466095924, + "epoch": 2.964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09580846130847931, + "learning_rate": 7.389109238526685e-07, + "loss": -0.0065, + "num_tokens": 12927948.0, + "reward": 13.760322570800781, + "reward_std": 1.1355624198913574, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7880313396453857, + "rewards/length2tails_reward/std": 0.24053345620632172, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.593855619430542, + "rewards/thermo_reward/std": 1.1204947233200073, + "step": 1482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09112018626183271, + "epoch": 2.966, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11008063703775406, + "learning_rate": 7.376733324683739e-07, + "loss": 0.0004, + "num_tokens": 12936698.0, + "reward": 13.719146728515625, + "reward_std": 0.5334470868110657, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.816321611404419, + "rewards/length2tails_reward/std": 0.219462051987648, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498504638671875, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09244338981807232, + "epoch": 2.968, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14250656962394714, + "learning_rate": 7.364361724143713e-07, + "loss": 0.0016, + "num_tokens": 12945441.0, + "reward": 13.672819137573242, + "reward_std": 0.9924048781394958, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7941070795059204, + "rewards/length2tails_reward/std": 0.2598108947277069, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.533102512359619, + "rewards/thermo_reward/std": 0.8569762706756592, + "step": 1484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.08355463948100805, + "epoch": 2.9699999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37764373421669006, + "learning_rate": 7.351994457248595e-07, + "loss": -0.0126, + "num_tokens": 12954127.0, + "reward": 13.909736633300781, + "reward_std": 0.31327277421951294, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7279137969017029, + "rewards/length2tails_reward/std": 0.26513198018074036, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09196582902222872, + "epoch": 2.972, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11685548722743988, + "learning_rate": 7.33963154433325e-07, + "loss": -0.0045, + "num_tokens": 12962845.0, + "reward": 13.807863235473633, + "reward_std": 1.0549274682998657, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7532917857170105, + "rewards/length2tails_reward/std": 0.27080824971199036, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6722283363342285, + "rewards/thermo_reward/std": 0.8871302604675293, + "step": 1486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0840097526088357, + "epoch": 2.974, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09556731581687927, + "learning_rate": 7.327273005725378e-07, + "loss": 0.0022, + "num_tokens": 12971617.0, + "reward": 13.643620491027832, + "reward_std": 1.7801275253295898, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.527078866958618, + "rewards/kidney_reward/std": 0.5622953772544861, + "rewards/length2tails_reward/mean": 0.8075554966926575, + "rewards/length2tails_reward/std": 0.2638454735279083, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.574601173400879, + "rewards/thermo_reward/std": 1.2273809909820557, + "step": 1487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.53125, + "completions/mean_terminated_length": 273.53125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08960465341806412, + "epoch": 2.976, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16316860914230347, + "learning_rate": 7.314918861745491e-07, + "loss": 0.0006, + "num_tokens": 12980402.0, + "reward": 13.456883430480957, + "reward_std": 1.9875643253326416, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.508101224899292, + "rewards/kidney_reward/std": 0.6696491241455078, + "rewards/length2tails_reward/mean": 0.8753653764724731, + "rewards/length2tails_reward/std": 0.1597120761871338, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4575695991516113, + "rewards/thermo_reward/std": 1.0479161739349365, + "step": 1488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.0878768339753151, + "epoch": 2.9779999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10435619950294495, + "learning_rate": 7.302569132706881e-07, + "loss": -0.0044, + "num_tokens": 12989106.0, + "reward": 13.693092346191406, + "reward_std": 1.0937471389770508, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7829598188400269, + "rewards/length2tails_reward/std": 0.27791085839271545, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09225155226886272, + "epoch": 2.98, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08390070497989655, + "learning_rate": 7.290223838915568e-07, + "loss": 0.0025, + "num_tokens": 12997801.0, + "reward": 13.758573532104492, + "reward_std": 0.5511883497238159, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.6864538192749023, + "rewards/length2tails_reward/std": 0.25705966353416443, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "entropy": 0.08846852649003267, + "epoch": 2.982, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10731520503759384, + "learning_rate": 7.277883000670288e-07, + "loss": -0.0101, + "num_tokens": 13006514.0, + "reward": 13.743173599243164, + "reward_std": 0.8424339294433594, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8157355189323425, + "rewards/length2tails_reward/std": 0.1975812315940857, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5739362239837646, + "rewards/thermo_reward/std": 0.8346105217933655, + "step": 1491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09590337425470352, + "epoch": 2.984, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11507551372051239, + "learning_rate": 7.26554663826245e-07, + "loss": 0.0046, + "num_tokens": 13015257.0, + "reward": 13.836891174316406, + "reward_std": 0.4224011301994324, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7971822023391724, + "rewards/length2tails_reward/std": 0.2093581259250641, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08532546181231737, + "epoch": 2.9859999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.72013258934021, + "learning_rate": 7.2532147719761e-07, + "loss": 0.0053, + "num_tokens": 13024007.0, + "reward": 12.834997177124023, + "reward_std": 4.0108962059021, + "rewards/fitness_reward/mean": 6.9856367111206055, + "rewards/fitness_reward/std": 1.8179237842559814, + "rewards/kidney_reward/mean": 2.371399402618408, + "rewards/kidney_reward/std": 1.0544934272766113, + "rewards/length2tails_reward/mean": 0.7837235927581787, + "rewards/length2tails_reward/std": 0.26284104585647583, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2995893955230713, + "rewards/thermo_reward/std": 1.2621272802352905, + "step": 1493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.03125, + "completions/mean_terminated_length": 270.03125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.0831696605309844, + "epoch": 2.988, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.190970778465271, + "learning_rate": 7.240887422087891e-07, + "loss": -0.0258, + "num_tokens": 13032680.0, + "reward": 13.309157371520996, + "reward_std": 2.8543763160705566, + "rewards/fitness_reward/mean": 6.997875690460205, + "rewards/fitness_reward/std": 2.0551881790161133, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8210108876228333, + "rewards/length2tails_reward/std": 0.20082461833953857, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.530060291290283, + "rewards/thermo_reward/std": 0.8719301223754883, + "step": 1494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.34375, + "completions/mean_terminated_length": 273.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08507030922919512, + "epoch": 2.99, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08497648686170578, + "learning_rate": 7.228564608867061e-07, + "loss": 0.0068, + "num_tokens": 13041459.0, + "reward": 13.772529602050781, + "reward_std": 0.5473827719688416, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8260170221328735, + "rewards/length2tails_reward/std": 0.22926293313503265, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6296226978302, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08261603862047195, + "epoch": 2.992, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10478515177965164, + "learning_rate": 7.216246352575369e-07, + "loss": -0.0042, + "num_tokens": 13050219.0, + "reward": 13.836133003234863, + "reward_std": 0.43924397230148315, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.789600133895874, + "rewards/length2tails_reward/std": 0.2798590064048767, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.09019718458876014, + "epoch": 2.9939999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09406401962041855, + "learning_rate": 7.2039326734671e-07, + "loss": -0.0024, + "num_tokens": 13058954.0, + "reward": 13.52038288116455, + "reward_std": 2.2548234462738037, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.512369394302368, + "rewards/kidney_reward/std": 0.6455049514770508, + "rewards/length2tails_reward/mean": 0.7954090237617493, + "rewards/length2tails_reward/std": 0.24171698093414307, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.524796962738037, + "rewards/thermo_reward/std": 1.2938830852508545, + "step": 1497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.25, + "completions/mean_terminated_length": 273.25, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.07924879249185324, + "epoch": 2.996, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06274835765361786, + "learning_rate": 7.191623591789005e-07, + "loss": -0.0041, + "num_tokens": 13067730.0, + "reward": 13.637685775756836, + "reward_std": 1.2025954723358154, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8336925506591797, + "rewards/length2tails_reward/std": 0.2251635044813156, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4940104484558105, + "rewards/thermo_reward/std": 1.054375171661377, + "step": 1498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 273.75, + "completions/mean_terminated_length": 273.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08210170036181808, + "epoch": 2.998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18217602372169495, + "learning_rate": 7.179319127780274e-07, + "loss": 0.002, + "num_tokens": 13076522.0, + "reward": 13.59107494354248, + "reward_std": 1.8719508647918701, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5338897705078125, + "rewards/kidney_reward/std": 0.5237670540809631, + "rewards/length2tails_reward/mean": 0.8339365720748901, + "rewards/length2tails_reward/std": 0.22332796454429626, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.570115804672241, + "rewards/thermo_reward/std": 1.0470080375671387, + "step": 1499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.8125, + "completions/mean_terminated_length": 273.8125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08131207153201103, + "epoch": 3.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11286388337612152, + "learning_rate": 7.167019301672508e-07, + "loss": 0.0017, + "num_tokens": 13085316.0, + "reward": 13.719484329223633, + "reward_std": 0.9667587876319885, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8409682512283325, + "rewards/length2tails_reward/std": 0.2269384115934372, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5750818252563477, + "rewards/thermo_reward/std": 0.8288360238075256, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08663884736597538, + "epoch": 3.002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07008332759141922, + "learning_rate": 7.154724133689676e-07, + "loss": -0.0032, + "num_tokens": 13094047.0, + "reward": 13.87397575378418, + "reward_std": 0.384539395570755, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7691654562950134, + "rewards/length2tails_reward/std": 0.2182990461587906, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.08829734660685062, + "epoch": 3.004, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10279041528701782, + "learning_rate": 7.142433644048098e-07, + "loss": 0.0024, + "num_tokens": 13102797.0, + "reward": 13.54155158996582, + "reward_std": 1.5787534713745117, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4996724128723145, + "rewards/kidney_reward/std": 0.5786310434341431, + "rewards/length2tails_reward/mean": 0.8198412656784058, + "rewards/length2tails_reward/std": 0.2560720145702362, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4987106323242188, + "rewards/thermo_reward/std": 1.0301309823989868, + "step": 1502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 271.65625, + "completions/mean_terminated_length": 271.65625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.10298821609467268, + "epoch": 3.006, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1507360190153122, + "learning_rate": 7.130147852956394e-07, + "loss": 0.0024, + "num_tokens": 13111522.0, + "reward": 13.954971313476562, + "reward_std": 0.22310635447502136, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7814017534255981, + "rewards/length2tails_reward/std": 0.24012959003448486, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08291113376617432, + "epoch": 3.008, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09166242182254791, + "learning_rate": 7.11786678061546e-07, + "loss": 0.0017, + "num_tokens": 13120276.0, + "reward": 13.839548110961914, + "reward_std": 0.4274718165397644, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8237426280975342, + "rewards/length2tails_reward/std": 0.223821222782135, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 276.78125, + "completions/mean_terminated_length": 276.78125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09564272407442331, + "epoch": 3.01, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13374081254005432, + "learning_rate": 7.105590447218437e-07, + "loss": -0.0031, + "num_tokens": 13129165.0, + "reward": 13.877220153808594, + "reward_std": 0.3734629452228546, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.801613986492157, + "rewards/length2tails_reward/std": 0.25205737352371216, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08292873064056039, + "epoch": 3.012, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2027139514684677, + "learning_rate": 7.093318872950664e-07, + "loss": -0.0017, + "num_tokens": 13137904.0, + "reward": 13.6582670211792, + "reward_std": 0.6689882874488831, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7547006607055664, + "rewards/length2tails_reward/std": 0.28135377168655396, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498504638671875, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.15625, + "completions/mean_terminated_length": 273.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08827351219952106, + "epoch": 3.014, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10204388946294785, + "learning_rate": 7.081052077989667e-07, + "loss": -0.0048, + "num_tokens": 13146677.0, + "reward": 13.630050659179688, + "reward_std": 1.1362738609313965, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8250014781951904, + "rewards/length2tails_reward/std": 0.24018093943595886, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0833295239135623, + "epoch": 3.016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1260230839252472, + "learning_rate": 7.068790082505107e-07, + "loss": -0.0019, + "num_tokens": 13155413.0, + "reward": 13.230853080749512, + "reward_std": 1.622039794921875, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7348164916038513, + "rewards/length2tails_reward/std": 0.3072350323200226, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.124424934387207, + "rewards/thermo_reward/std": 1.4224731922149658, + "step": 1508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.28125, + "completions/mean_terminated_length": 273.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08457625657320023, + "epoch": 3.018, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12385375797748566, + "learning_rate": 7.056532906658752e-07, + "loss": 0.0009, + "num_tokens": 13164190.0, + "reward": 13.543134689331055, + "reward_std": 1.7194054126739502, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8602433800697327, + "rewards/length2tails_reward/std": 0.20957812666893005, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4543137550354004, + "rewards/thermo_reward/std": 1.2629039287567139, + "step": 1509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.08766997698694468, + "epoch": 3.02, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1102355569601059, + "learning_rate": 7.04428057060445e-07, + "loss": -0.0051, + "num_tokens": 13172932.0, + "reward": 13.820863723754883, + "reward_std": 0.574556827545166, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7852233648300171, + "rewards/length2tails_reward/std": 0.20834431052207947, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09607646521180868, + "epoch": 3.022, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08452524989843369, + "learning_rate": 7.032033094488093e-07, + "loss": -0.0032, + "num_tokens": 13181676.0, + "reward": 13.797369956970215, + "reward_std": 0.47734662890434265, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8008310794830322, + "rewards/length2tails_reward/std": 0.2595970034599304, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09418830275535583, + "epoch": 3.024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1552160084247589, + "learning_rate": 7.019790498447571e-07, + "loss": 0.0019, + "num_tokens": 13190441.0, + "reward": 13.758134841918945, + "reward_std": 0.49969205260276794, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8073362112045288, + "rewards/length2tails_reward/std": 0.2395055890083313, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0807279646396637, + "epoch": 3.026, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10100645571947098, + "learning_rate": 7.007552802612764e-07, + "loss": 0.0006, + "num_tokens": 13199143.0, + "reward": 13.867382049560547, + "reward_std": 0.3748721182346344, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.70323646068573, + "rewards/length2tails_reward/std": 0.28744640946388245, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 274.6875, + "completions/mean_terminated_length": 274.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08788215974345803, + "epoch": 3.028, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13024696707725525, + "learning_rate": 6.995320027105481e-07, + "loss": -0.0022, + "num_tokens": 13207965.0, + "reward": 13.626249313354492, + "reward_std": 1.1537022590637207, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8483192920684814, + "rewards/length2tails_reward/std": 0.22150689363479614, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5084705352783203, + "rewards/thermo_reward/std": 0.9801769852638245, + "step": 1514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09518216829746962, + "epoch": 3.03, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09895037114620209, + "learning_rate": 6.983092192039455e-07, + "loss": 0.0021, + "num_tokens": 13216685.0, + "reward": 13.582860946655273, + "reward_std": 1.8713279962539673, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.538771867752075, + "rewards/kidney_reward/std": 0.49614986777305603, + "rewards/length2tails_reward/mean": 0.7530010342597961, + "rewards/length2tails_reward/std": 0.24346208572387695, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5076041221618652, + "rewards/thermo_reward/std": 1.3884414434432983, + "step": 1515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.5, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08413621364161372, + "epoch": 3.032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10287663340568542, + "learning_rate": 6.970869317520279e-07, + "loss": 0.0008, + "num_tokens": 13225469.0, + "reward": 13.882369995117188, + "reward_std": 0.38048040866851807, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8531134128570557, + "rewards/length2tails_reward/std": 0.20098592340946198, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08602802455425262, + "epoch": 3.034, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12859375774860382, + "learning_rate": 6.958651423645407e-07, + "loss": -0.0047, + "num_tokens": 13234215.0, + "reward": 13.630465507507324, + "reward_std": 0.917415976524353, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7645262479782104, + "rewards/length2tails_reward/std": 0.28300315141677856, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4937076568603516, + "rewards/thermo_reward/std": 0.8699823021888733, + "step": 1517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 287.40625, + "completions/mean_terminated_length": 272.3548278808594, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10801392560824752, + "epoch": 3.036, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9083175659179688, + "learning_rate": 6.946438530504093e-07, + "loss": -0.0247, + "num_tokens": 13243444.0, + "reward": 13.646073341369629, + "reward_std": 0.9348779320716858, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7570778727531433, + "rewards/length2tails_reward/std": 0.2936254143714905, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.482700824737549, + "rewards/thermo_reward/std": 0.9230156540870667, + "step": 1518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.08214408811181784, + "epoch": 3.038, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1576862931251526, + "learning_rate": 6.934230658177372e-07, + "loss": 0.0011, + "num_tokens": 13252134.0, + "reward": 13.114895820617676, + "reward_std": 4.055248260498047, + "rewards/fitness_reward/mean": 7.030417442321777, + "rewards/fitness_reward/std": 1.8711055517196655, + "rewards/kidney_reward/mean": 2.4575986862182617, + "rewards/kidney_reward/std": 0.9553350806236267, + "rewards/length2tails_reward/mean": 0.7602359652519226, + "rewards/length2tails_reward/std": 0.27951622009277344, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4508557319641113, + "rewards/thermo_reward/std": 1.2813252210617065, + "step": 1519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.08832096960395575, + "epoch": 3.04, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07851015776395798, + "learning_rate": 6.922027826738017e-07, + "loss": -0.0059, + "num_tokens": 13260855.0, + "reward": 13.069082260131836, + "reward_std": 2.6473140716552734, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.45493483543396, + "rewards/kidney_reward/std": 0.6855012774467468, + "rewards/length2tails_reward/mean": 0.7882065773010254, + "rewards/length2tails_reward/std": 0.26846760511398315, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.131650447845459, + "rewards/thermo_reward/std": 1.86585533618927, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.65625, + "completions/mean_terminated_length": 271.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08753087185323238, + "epoch": 3.042, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09079719334840775, + "learning_rate": 6.909830056250526e-07, + "loss": 0.0031, + "num_tokens": 13269580.0, + "reward": 13.12080192565918, + "reward_std": 2.2184596061706543, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.354142189025879, + "rewards/kidney_reward/std": 0.8856872916221619, + "rewards/length2tails_reward/mean": 0.706944465637207, + "rewards/length2tails_reward/std": 0.328614205121994, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2922892570495605, + "rewards/thermo_reward/std": 1.2992886304855347, + "step": 1521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5, + "completions/mean_terminated_length": 272.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09137775283306837, + "epoch": 3.044, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13857701420783997, + "learning_rate": 6.897637366771066e-07, + "loss": 0.0034, + "num_tokens": 13278332.0, + "reward": 13.755581855773926, + "reward_std": 0.5007432103157043, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.781810998916626, + "rewards/length2tails_reward/std": 0.25280410051345825, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.0889419955201447, + "epoch": 3.046, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06803309172391891, + "learning_rate": 6.885449778347448e-07, + "loss": -0.0041, + "num_tokens": 13287063.0, + "reward": 13.599407196044922, + "reward_std": 1.1936312913894653, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7993759512901306, + "rewards/length2tails_reward/std": 0.2781977951526642, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4591641426086426, + "rewards/thermo_reward/std": 1.0398609638214111, + "step": 1523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.65625, + "completions/mean_terminated_length": 271.65625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.08468360500410199, + "epoch": 3.048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10303880274295807, + "learning_rate": 6.8732673110191e-07, + "loss": 0.0065, + "num_tokens": 13295788.0, + "reward": 13.873723983764648, + "reward_std": 0.3729260563850403, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7666493654251099, + "rewards/length2tails_reward/std": 0.28587543964385986, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07929568085819483, + "epoch": 3.05, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08844693750143051, + "learning_rate": 6.861089984817032e-07, + "loss": -0.0001, + "num_tokens": 13304518.0, + "reward": 13.651402473449707, + "reward_std": 0.730636477470398, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5263874530792236, + "rewards/kidney_reward/std": 0.5662070512771606, + "rewards/length2tails_reward/mean": 0.7409303784370422, + "rewards/length2tails_reward/std": 0.30849483609199524, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09142199764028192, + "epoch": 3.052, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08430390805006027, + "learning_rate": 6.848917819763793e-07, + "loss": 0.002, + "num_tokens": 13313263.0, + "reward": 13.95113754272461, + "reward_std": 0.222835972905159, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7430736422538757, + "rewards/length2tails_reward/std": 0.27778905630111694, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08689901698380709, + "epoch": 3.054, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2107875794172287, + "learning_rate": 6.836750835873453e-07, + "loss": -0.0049, + "num_tokens": 13322011.0, + "reward": 13.468440055847168, + "reward_std": 3.0067617893218994, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.5390896797180176, + "rewards/kidney_reward/std": 0.49435171484947205, + "rewards/length2tails_reward/mean": 0.806360125541687, + "rewards/length2tails_reward/std": 0.24003668129444122, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.69566011428833, + "rewards/thermo_reward/std": 0.7545809149742126, + "step": 1527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.09294356871396303, + "epoch": 3.056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1360463947057724, + "learning_rate": 6.824589053151557e-07, + "loss": -0.0019, + "num_tokens": 13330687.0, + "reward": 13.681583404541016, + "reward_std": 0.9170136451721191, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7322138547897339, + "rewards/length2tails_reward/std": 0.2564483880996704, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.520697593688965, + "rewards/thermo_reward/std": 0.9184462428092957, + "step": 1528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.07564508123323321, + "epoch": 3.058, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09548187255859375, + "learning_rate": 6.812432491595102e-07, + "loss": -0.0056, + "num_tokens": 13339397.0, + "reward": 13.341715812683105, + "reward_std": 2.5133228302001953, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.447655439376831, + "rewards/kidney_reward/std": 1.011582612991333, + "rewards/length2tails_reward/mean": 0.7018471360206604, + "rewards/length2tails_reward/std": 0.3438743054866791, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3626902103424072, + "rewards/thermo_reward/std": 1.6830047369003296, + "step": 1529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08310075104236603, + "epoch": 3.06, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10243958979845047, + "learning_rate": 6.800281171192501e-07, + "loss": 0.0017, + "num_tokens": 13348161.0, + "reward": 13.800646781921387, + "reward_std": 0.4690954089164734, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8335978984832764, + "rewards/length2tails_reward/std": 0.16988824307918549, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09586561750620604, + "epoch": 3.062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11107802391052246, + "learning_rate": 6.788135111923545e-07, + "loss": -0.0035, + "num_tokens": 13356916.0, + "reward": 13.770654678344727, + "reward_std": 0.5631916522979736, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8072643876075745, + "rewards/length2tails_reward/std": 0.28409481048583984, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.15625, + "completions/mean_terminated_length": 273.15625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08622128795832396, + "epoch": 3.064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08057373017072678, + "learning_rate": 6.775994333759378e-07, + "loss": -0.0001, + "num_tokens": 13365689.0, + "reward": 13.850677490234375, + "reward_std": 0.4851415753364563, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.809777021408081, + "rewards/length2tails_reward/std": 0.24094976484775543, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07591812824830413, + "epoch": 3.066, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10544803738594055, + "learning_rate": 6.763858856662457e-07, + "loss": -0.0028, + "num_tokens": 13374447.0, + "reward": 13.523006439208984, + "reward_std": 1.351911187171936, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5063438415527344, + "rewards/kidney_reward/std": 0.5423585772514343, + "rewards/length2tails_reward/mean": 0.7550817728042603, + "rewards/length2tails_reward/std": 0.2964111268520355, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4799704551696777, + "rewards/thermo_reward/std": 1.1274259090423584, + "step": 1533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0872881724499166, + "epoch": 3.068, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10545146465301514, + "learning_rate": 6.751728700586525e-07, + "loss": -0.0006, + "num_tokens": 13383202.0, + "reward": 13.916288375854492, + "reward_std": 0.31205329298973083, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7934369444847107, + "rewards/length2tails_reward/std": 0.24310320615768433, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0812803409062326, + "epoch": 3.07, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08104109019041061, + "learning_rate": 6.739603885476582e-07, + "loss": -0.0019, + "num_tokens": 13391938.0, + "reward": 13.794636726379395, + "reward_std": 0.4737315773963928, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7734951376914978, + "rewards/length2tails_reward/std": 0.24882858991622925, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.09241558611392975, + "epoch": 3.072, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17455969750881195, + "learning_rate": 6.727484431268831e-07, + "loss": 0.0065, + "num_tokens": 13400682.0, + "reward": 13.918707847595215, + "reward_std": 0.3102504312992096, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.817625105381012, + "rewards/length2tails_reward/std": 0.24556082487106323, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.71875, + "completions/mean_terminated_length": 272.71875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.08524061739444733, + "epoch": 3.074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07834436744451523, + "learning_rate": 6.715370357890678e-07, + "loss": -0.0058, + "num_tokens": 13409441.0, + "reward": 13.360005378723145, + "reward_std": 2.4934613704681396, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.484485626220703, + "rewards/kidney_reward/std": 0.8032392859458923, + "rewards/length2tails_reward/mean": 0.7930335998535156, + "rewards/length2tails_reward/std": 0.2633064091205597, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.450049877166748, + "rewards/thermo_reward/std": 1.0861150026321411, + "step": 1537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 273.6875, + "completions/mean_terminated_length": 273.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10845900978893042, + "epoch": 3.076, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1100066602230072, + "learning_rate": 6.703261685260663e-07, + "loss": -0.001, + "num_tokens": 13418231.0, + "reward": 13.392513275146484, + "reward_std": 1.8563884496688843, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.473618507385254, + "rewards/kidney_reward/std": 0.5860860347747803, + "rewards/length2tails_reward/mean": 0.7303594350814819, + "rewards/length2tails_reward/std": 0.29636603593826294, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.384673833847046, + "rewards/thermo_reward/std": 1.30971097946167, + "step": 1538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08378116134554148, + "epoch": 3.078, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13051924109458923, + "learning_rate": 6.691158433288464e-07, + "loss": 0.0019, + "num_tokens": 13426963.0, + "reward": 13.707208633422852, + "reward_std": 1.1767970323562622, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7400823831558228, + "rewards/length2tails_reward/std": 0.2969076633453369, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5728940963745117, + "rewards/thermo_reward/std": 1.0320227146148682, + "step": 1539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.09347997140139341, + "epoch": 3.08, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14912846684455872, + "learning_rate": 6.679060621874833e-07, + "loss": 0.0012, + "num_tokens": 13435662.0, + "reward": 13.875182151794434, + "reward_std": 0.38105764985084534, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7812291979789734, + "rewards/length2tails_reward/std": 0.24451898038387299, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 274.90625, + "completions/mean_terminated_length": 274.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08737827092409134, + "epoch": 3.082, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12639953196048737, + "learning_rate": 6.666968270911584e-07, + "loss": -0.0018, + "num_tokens": 13444491.0, + "reward": 13.922403335571289, + "reward_std": 0.3234083950519562, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.854577898979187, + "rewards/length2tails_reward/std": 0.2517944872379303, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08177088387310505, + "epoch": 3.084, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11636722087860107, + "learning_rate": 6.654881400281547e-07, + "loss": -0.0042, + "num_tokens": 13453238.0, + "reward": 13.915096282958984, + "reward_std": 0.32327255606651306, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7815147638320923, + "rewards/length2tails_reward/std": 0.2819795310497284, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "entropy": 0.09405922424048185, + "epoch": 3.086, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1301419883966446, + "learning_rate": 6.642800029858546e-07, + "loss": -0.0003, + "num_tokens": 13461937.0, + "reward": 13.520773887634277, + "reward_std": 1.081948161125183, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4921867847442627, + "rewards/kidney_reward/std": 0.6195381283760071, + "rewards/length2tails_reward/mean": 0.77314692735672, + "rewards/length2tails_reward/std": 0.26673877239227295, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4900870323181152, + "rewards/thermo_reward/std": 0.8872950673103333, + "step": 1543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.8125, + "completions/mean_terminated_length": 272.8125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08489370346069336, + "epoch": 3.088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11313661932945251, + "learning_rate": 6.630724179507361e-07, + "loss": 0.0013, + "num_tokens": 13470699.0, + "reward": 13.793017387390137, + "reward_std": 0.4702511131763458, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7573058009147644, + "rewards/length2tails_reward/std": 0.3235591948032379, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08466823026537895, + "epoch": 3.09, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11544471979141235, + "learning_rate": 6.618653869083688e-07, + "loss": 0.0016, + "num_tokens": 13479433.0, + "reward": 12.832748413085938, + "reward_std": 4.525689125061035, + "rewards/fitness_reward/mean": 6.952455520629883, + "rewards/fitness_reward/std": 2.0028762817382812, + "rewards/kidney_reward/mean": 2.3317272663116455, + "rewards/kidney_reward/std": 1.091876745223999, + "rewards/length2tails_reward/mean": 0.7750874757766724, + "rewards/length2tails_reward/std": 0.2632858455181122, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.371056318283081, + "rewards/thermo_reward/std": 1.5435597896575928, + "step": 1545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0935161211527884, + "epoch": 3.092, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23990941047668457, + "learning_rate": 6.606589118434125e-07, + "loss": 0.0025, + "num_tokens": 13488171.0, + "reward": 13.167928695678711, + "reward_std": 2.527303695678711, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3861141204833984, + "rewards/kidney_reward/std": 0.8551732897758484, + "rewards/length2tails_reward/mean": 0.7293937802314758, + "rewards/length2tails_reward/std": 0.3044939637184143, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.30519962310791, + "rewards/thermo_reward/std": 1.4556901454925537, + "step": 1546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.4375, + "completions/mean_terminated_length": 273.4375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0900156507268548, + "epoch": 3.094, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12602326273918152, + "learning_rate": 6.59452994739612e-07, + "loss": 0.0002, + "num_tokens": 13496953.0, + "reward": 13.653825759887695, + "reward_std": 0.6284289956092834, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8355517387390137, + "rewards/length2tails_reward/std": 0.2260853499174118, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.50996470451355, + "rewards/thermo_reward/std": 0.5615194439888, + "step": 1547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08785140514373779, + "epoch": 3.096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09885193407535553, + "learning_rate": 6.582476375797948e-07, + "loss": -0.0035, + "num_tokens": 13505680.0, + "reward": 13.804637908935547, + "reward_std": 0.5242769122123718, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7482430934906006, + "rewards/length2tails_reward/std": 0.25076186656951904, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 274.6875, + "completions/mean_terminated_length": 274.6875, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.0945708341896534, + "epoch": 3.098, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5140098333358765, + "learning_rate": 6.570428423458686e-07, + "loss": -0.0029, + "num_tokens": 13514502.0, + "reward": 12.935726165771484, + "reward_std": 5.054202556610107, + "rewards/fitness_reward/mean": 6.980704307556152, + "rewards/fitness_reward/std": 2.1523258686065674, + "rewards/kidney_reward/mean": 2.3935017585754395, + "rewards/kidney_reward/std": 1.3179216384887695, + "rewards/length2tails_reward/mean": 0.7184333205223083, + "rewards/length2tails_reward/std": 0.29144835472106934, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.389676094055176, + "rewards/thermo_reward/std": 1.6116918325424194, + "step": 1549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09089232515543699, + "epoch": 3.1, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18398801982402802, + "learning_rate": 6.558386110188157e-07, + "loss": -0.0039, + "num_tokens": 13523245.0, + "reward": 13.343172073364258, + "reward_std": 2.6493773460388184, + "rewards/fitness_reward/mean": 7.026922702789307, + "rewards/fitness_reward/std": 1.8908731937408447, + "rewards/kidney_reward/mean": 2.510338068008423, + "rewards/kidney_reward/std": 0.5207419395446777, + "rewards/length2tails_reward/mean": 0.7628942728042603, + "rewards/length2tails_reward/std": 0.2707509398460388, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.15625, + "completions/mean_terminated_length": 273.15625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08175284508615732, + "epoch": 3.102, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0869038999080658, + "learning_rate": 6.546349455786925e-07, + "loss": -0.005, + "num_tokens": 13532018.0, + "reward": 13.078606605529785, + "reward_std": 2.4494495391845703, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.470146656036377, + "rewards/kidney_reward/std": 0.6043990254402161, + "rewards/length2tails_reward/mean": 0.8006786108016968, + "rewards/length2tails_reward/std": 0.2602311670780182, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1247167587280273, + "rewards/thermo_reward/std": 1.6955657005310059, + "step": 1551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08147422038018703, + "epoch": 3.104, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08447594940662384, + "learning_rate": 6.534318480046239e-07, + "loss": -0.0035, + "num_tokens": 13540734.0, + "reward": 13.59555435180664, + "reward_std": 1.7914360761642456, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.519392490386963, + "rewards/kidney_reward/std": 0.6057767271995544, + "rewards/length2tails_reward/mean": 0.7064756155014038, + "rewards/length2tails_reward/std": 0.27535876631736755, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.601839065551758, + "rewards/thermo_reward/std": 0.8774381279945374, + "step": 1552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08858750760555267, + "epoch": 3.106, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1847735047340393, + "learning_rate": 6.522293202748017e-07, + "loss": 0.0004, + "num_tokens": 13549494.0, + "reward": 13.918962478637695, + "reward_std": 0.3105127811431885, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8201719522476196, + "rewards/length2tails_reward/std": 0.1941031664609909, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0853114714846015, + "epoch": 3.108, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17123565077781677, + "learning_rate": 6.51027364366481e-07, + "loss": 0.0022, + "num_tokens": 13558218.0, + "reward": 13.52690601348877, + "reward_std": 1.9610837697982788, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5366315841674805, + "rewards/kidney_reward/std": 0.5082566142082214, + "rewards/length2tails_reward/mean": 0.7408733367919922, + "rewards/length2tails_reward/std": 0.27830973267555237, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5125112533569336, + "rewards/thermo_reward/std": 1.155271291732788, + "step": 1554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 268.625, + "completions/mean_terminated_length": 268.625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.0793598871678114, + "epoch": 3.11, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09068351984024048, + "learning_rate": 6.498259822559757e-07, + "loss": 0.0019, + "num_tokens": 13566846.0, + "reward": 13.65500259399414, + "reward_std": 0.8623346090316772, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7111403942108154, + "rewards/length2tails_reward/std": 0.30966800451278687, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4962234497070312, + "rewards/thermo_reward/std": 0.8580347895622253, + "step": 1555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.96875, + "completions/mean_terminated_length": 273.96875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.08767529763281345, + "epoch": 3.112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08908183872699738, + "learning_rate": 6.486251759186572e-07, + "loss": -0.0025, + "num_tokens": 13575645.0, + "reward": 13.632354736328125, + "reward_std": 1.063725471496582, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8941877484321594, + "rewards/length2tails_reward/std": 0.16848689317703247, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4826302528381348, + "rewards/thermo_reward/std": 0.9233596324920654, + "step": 1556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08610057085752487, + "epoch": 3.114, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14670053124427795, + "learning_rate": 6.474249473289497e-07, + "loss": 0.0057, + "num_tokens": 13584393.0, + "reward": 13.596689224243164, + "reward_std": 1.6090404987335205, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5318994522094727, + "rewards/kidney_reward/std": 0.535025417804718, + "rewards/length2tails_reward/mean": 0.807540774345398, + "rewards/length2tails_reward/std": 0.24658679962158203, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5228500366210938, + "rewards/thermo_reward/std": 1.1001607179641724, + "step": 1557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08467079140245914, + "epoch": 3.116, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15976735949516296, + "learning_rate": 6.462252984603276e-07, + "loss": 0.0062, + "num_tokens": 13593147.0, + "reward": 13.846799850463867, + "reward_std": 0.475908488035202, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7709941267967224, + "rewards/length2tails_reward/std": 0.25571998953819275, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07919182069599628, + "epoch": 3.118, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11993545293807983, + "learning_rate": 6.45026231285312e-07, + "loss": 0.0006, + "num_tokens": 13601911.0, + "reward": 13.699793815612793, + "reward_std": 1.1293576955795288, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7839176058769226, + "rewards/length2tails_reward/std": 0.28801000118255615, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5884556770324707, + "rewards/thermo_reward/std": 0.9485320448875427, + "step": 1559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.3125, + "completions/mean_terminated_length": 273.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08516907598823309, + "epoch": 3.12, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10007896274328232, + "learning_rate": 6.438277477754678e-07, + "loss": -0.0011, + "num_tokens": 13610689.0, + "reward": 13.881974220275879, + "reward_std": 0.37886977195739746, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8491480350494385, + "rewards/length2tails_reward/std": 0.19460441172122955, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 273.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08439195808023214, + "epoch": 3.122, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11381310224533081, + "learning_rate": 6.426298499013993e-07, + "loss": 0.0018, + "num_tokens": 13619469.0, + "reward": 13.816393852233887, + "reward_std": 0.5212529897689819, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8657984137535095, + "rewards/length2tails_reward/std": 0.16835640370845795, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07988157263025641, + "epoch": 3.124, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1909734606742859, + "learning_rate": 6.414325396327492e-07, + "loss": 0.003, + "num_tokens": 13628224.0, + "reward": 12.954696655273438, + "reward_std": 5.199808120727539, + "rewards/fitness_reward/mean": 6.977269172668457, + "rewards/fitness_reward/std": 2.171755075454712, + "rewards/kidney_reward/mean": 2.3789546489715576, + "rewards/kidney_reward/std": 1.4002131223678589, + "rewards/length2tails_reward/mean": 0.7894188165664673, + "rewards/length2tails_reward/std": 0.25925904512405396, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.419530153274536, + "rewards/thermo_reward/std": 1.6613085269927979, + "step": 1562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.28125, + "completions/mean_terminated_length": 273.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08113757288083434, + "epoch": 3.126, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08598976582288742, + "learning_rate": 6.402358189381933e-07, + "loss": -0.0033, + "num_tokens": 13637001.0, + "reward": 13.570723533630371, + "reward_std": 1.384864091873169, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7958351373672485, + "rewards/length2tails_reward/std": 0.32051900029182434, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.458193302154541, + "rewards/thermo_reward/std": 1.1682724952697754, + "step": 1563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.08710565231740475, + "epoch": 3.128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13959741592407227, + "learning_rate": 6.390396897854378e-07, + "loss": 0.0063, + "num_tokens": 13645694.0, + "reward": 13.752948760986328, + "reward_std": 0.5005505681037903, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.755474328994751, + "rewards/length2tails_reward/std": 0.2612459063529968, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.46875, + "completions/mean_terminated_length": 273.46875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08403254486620426, + "epoch": 3.13, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09504576772451401, + "learning_rate": 6.37844154141217e-07, + "loss": 0.0023, + "num_tokens": 13654477.0, + "reward": 13.760905265808105, + "reward_std": 0.5017654299736023, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.835037887096405, + "rewards/length2tails_reward/std": 0.23825286328792572, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.78125, + "completions/mean_terminated_length": 273.78125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08621383644640446, + "epoch": 3.132, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11715393513441086, + "learning_rate": 6.366492139712885e-07, + "loss": -0.0018, + "num_tokens": 13663270.0, + "reward": 13.812804222106934, + "reward_std": 0.5235148072242737, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8299036622047424, + "rewards/length2tails_reward/std": 0.23763789236545563, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.0625, + "completions/mean_terminated_length": 271.0625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "entropy": 0.09282642137259245, + "epoch": 3.134, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12017302215099335, + "learning_rate": 6.354548712404313e-07, + "loss": -0.0013, + "num_tokens": 13671976.0, + "reward": 13.878923416137695, + "reward_std": 0.3813875913619995, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8186453580856323, + "rewards/length2tails_reward/std": 0.22613418102264404, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.0821957616135478, + "epoch": 3.136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07647687196731567, + "learning_rate": 6.342611279124421e-07, + "loss": -0.0019, + "num_tokens": 13680715.0, + "reward": 13.818721771240234, + "reward_std": 0.8213350176811218, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8214808702468872, + "rewards/length2tails_reward/std": 0.21197476983070374, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6489083766937256, + "rewards/thermo_reward/std": 0.8178472518920898, + "step": 1568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.40625, + "completions/mean_terminated_length": 271.40625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08138279803097248, + "epoch": 3.138, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08109301328659058, + "learning_rate": 6.330679859501315e-07, + "loss": -0.0057, + "num_tokens": 13689432.0, + "reward": 13.03253173828125, + "reward_std": 2.44555926322937, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.437547206878662, + "rewards/kidney_reward/std": 0.745245099067688, + "rewards/length2tails_reward/mean": 0.7165517807006836, + "rewards/length2tails_reward/std": 0.28471410274505615, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.0621438026428223, + "rewards/thermo_reward/std": 1.7939794063568115, + "step": 1569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.28125, + "completions/mean_terminated_length": 273.28125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08364815032109618, + "epoch": 3.14, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06438497453927994, + "learning_rate": 6.31875447315322e-07, + "loss": -0.0039, + "num_tokens": 13698209.0, + "reward": 13.600080490112305, + "reward_std": 1.6844980716705322, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.500974655151367, + "rewards/kidney_reward/std": 0.5715343952178955, + "rewards/length2tails_reward/mean": 0.8087210059165955, + "rewards/length2tails_reward/std": 0.2439965009689331, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.557048797607422, + "rewards/thermo_reward/std": 1.1177526712417603, + "step": 1570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.07725761877372861, + "epoch": 3.142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06575439870357513, + "learning_rate": 6.306835139688438e-07, + "loss": -0.0062, + "num_tokens": 13706940.0, + "reward": 13.157386779785156, + "reward_std": 3.1938116550445557, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.4843716621398926, + "rewards/kidney_reward/std": 0.5299732089042664, + "rewards/length2tails_reward/mean": 0.8027685880661011, + "rewards/length2tails_reward/std": 0.2714056968688965, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4396843910217285, + "rewards/thermo_reward/std": 1.2497756481170654, + "step": 1571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08324752282351255, + "epoch": 3.144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11135981976985931, + "learning_rate": 6.294921878705312e-07, + "loss": -0.0039, + "num_tokens": 13715690.0, + "reward": 13.836538314819336, + "reward_std": 0.4356030225753784, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.793649435043335, + "rewards/length2tails_reward/std": 0.24881352484226227, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08308309270069003, + "epoch": 3.146, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09605570137500763, + "learning_rate": 6.283014709792214e-07, + "loss": 0.0024, + "num_tokens": 13724454.0, + "reward": 13.760598182678223, + "reward_std": 0.5016447901725769, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8319677114486694, + "rewards/length2tails_reward/std": 0.1931665539741516, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0848759338259697, + "epoch": 3.148, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09683488309383392, + "learning_rate": 6.271113652527485e-07, + "loss": -0.0002, + "num_tokens": 13733217.0, + "reward": 13.728259086608887, + "reward_std": 0.5810441374778748, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.782167911529541, + "rewards/length2tails_reward/std": 0.2472234070301056, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.40625, + "completions/mean_terminated_length": 273.40625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08548160269856453, + "epoch": 3.15, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12159522622823715, + "learning_rate": 6.259218726479427e-07, + "loss": -0.0008, + "num_tokens": 13741998.0, + "reward": 13.842123985290527, + "reward_std": 0.43085977435112, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8495111465454102, + "rewards/length2tails_reward/std": 0.19437187910079956, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08964389516040683, + "epoch": 3.152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12555015087127686, + "learning_rate": 6.247329951206259e-07, + "loss": 0.0004, + "num_tokens": 13750733.0, + "reward": 13.40833854675293, + "reward_std": 1.396416425704956, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7877026796340942, + "rewards/length2tails_reward/std": 0.2363130748271942, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2692625522613525, + "rewards/thermo_reward/std": 1.3035361766815186, + "step": 1576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08218161016702652, + "epoch": 3.154, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09898626804351807, + "learning_rate": 6.23544734625608e-07, + "loss": 0.0033, + "num_tokens": 13759489.0, + "reward": 13.878093719482422, + "reward_std": 0.3739182651042938, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8103445768356323, + "rewards/length2tails_reward/std": 0.20301468670368195, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08957744669169188, + "epoch": 3.156, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2144327163696289, + "learning_rate": 6.223570931166851e-07, + "loss": 0.0001, + "num_tokens": 13768254.0, + "reward": 13.625322341918945, + "reward_std": 1.0448532104492188, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.799221396446228, + "rewards/length2tails_reward/std": 0.24112744629383087, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4850940704345703, + "rewards/thermo_reward/std": 0.911384642124176, + "step": 1578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.08084192033857107, + "epoch": 3.158, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15178348124027252, + "learning_rate": 6.211700725466351e-07, + "loss": 0.0067, + "num_tokens": 13776973.0, + "reward": 13.87630558013916, + "reward_std": 0.3730323016643524, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7924637794494629, + "rewards/length2tails_reward/std": 0.24484948813915253, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.0818185918033123, + "epoch": 3.16, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09542899578809738, + "learning_rate": 6.199836748672152e-07, + "loss": 0.0066, + "num_tokens": 13785713.0, + "reward": 13.722007751464844, + "reward_std": 0.5323172211647034, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8449215292930603, + "rewards/length2tails_reward/std": 0.19587825238704681, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498504638671875, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.53125, + "completions/mean_terminated_length": 272.53125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08284872956573963, + "epoch": 3.162, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13644354045391083, + "learning_rate": 6.187979020291583e-07, + "loss": 0.0023, + "num_tokens": 13794466.0, + "reward": 13.477725982666016, + "reward_std": 1.6386417150497437, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5256075859069824, + "rewards/kidney_reward/std": 0.5706179141998291, + "rewards/length2tails_reward/mean": 0.7800761461257935, + "rewards/length2tails_reward/std": 0.24554499983787537, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4129247665405273, + "rewards/thermo_reward/std": 1.196850061416626, + "step": 1581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.1875, + "completions/mean_terminated_length": 273.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08427475392818451, + "epoch": 3.164, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10350681841373444, + "learning_rate": 6.176127559821698e-07, + "loss": 0.0056, + "num_tokens": 13803240.0, + "reward": 13.82392406463623, + "reward_std": 0.5608053207397461, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8158328533172607, + "rewards/length2tails_reward/std": 0.2515362501144409, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.5, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09279169328510761, + "epoch": 3.166, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1039460077881813, + "learning_rate": 6.164282386749248e-07, + "loss": 0.0003, + "num_tokens": 13812024.0, + "reward": 13.722917556762695, + "reward_std": 0.5369901657104492, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8540178537368774, + "rewards/length2tails_reward/std": 0.18012025952339172, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498504638671875, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.09811558667570353, + "epoch": 3.168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22617964446544647, + "learning_rate": 6.152443520550641e-07, + "loss": 0.0045, + "num_tokens": 13820734.0, + "reward": 13.63374137878418, + "reward_std": 1.0049004554748535, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8045656681060791, + "rewards/length2tails_reward/std": 0.2367853969335556, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.492978096008301, + "rewards/thermo_reward/std": 0.8734596967697144, + "step": 1584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.07768923044204712, + "epoch": 3.17, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09221001714468002, + "learning_rate": 6.14061098069192e-07, + "loss": -0.0039, + "num_tokens": 13829497.0, + "reward": 13.746610641479492, + "reward_std": 0.7933074235916138, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7430543899536133, + "rewards/length2tails_reward/std": 0.33624374866485596, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07595672411844134, + "epoch": 3.172, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10004629194736481, + "learning_rate": 6.128784786628721e-07, + "loss": 0.0, + "num_tokens": 13838251.0, + "reward": 13.531187057495117, + "reward_std": 2.1729977130889893, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.515078544616699, + "rewards/kidney_reward/std": 0.6301804184913635, + "rewards/length2tails_reward/mean": 0.7734501361846924, + "rewards/length2tails_reward/std": 0.26072344183921814, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.535088539123535, + "rewards/thermo_reward/std": 1.2374815940856934, + "step": 1586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.08676011674106121, + "epoch": 3.174, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07602910697460175, + "learning_rate": 6.116964957806252e-07, + "loss": -0.0037, + "num_tokens": 13846971.0, + "reward": 13.728689193725586, + "reward_std": 1.1042555570602417, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7970216274261475, + "rewards/length2tails_reward/std": 0.2547018229961395, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5613222122192383, + "rewards/thermo_reward/std": 1.0945703983306885, + "step": 1587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.08311831764876842, + "epoch": 3.176, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07211422175168991, + "learning_rate": 6.105151513659248e-07, + "loss": -0.0072, + "num_tokens": 13855651.0, + "reward": 13.759332656860352, + "reward_std": 1.151942491531372, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8217657804489136, + "rewards/length2tails_reward/std": 0.21550104022026062, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5894906520843506, + "rewards/thermo_reward/std": 1.1446915864944458, + "step": 1588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08453467022627592, + "epoch": 3.178, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09263108670711517, + "learning_rate": 6.093344473611951e-07, + "loss": -0.0019, + "num_tokens": 13864409.0, + "reward": 13.754880905151367, + "reward_std": 0.5109540820121765, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7747988700866699, + "rewards/length2tails_reward/std": 0.24791789054870605, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08336858032271266, + "epoch": 3.18, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09499793499708176, + "learning_rate": 6.081543857078075e-07, + "loss": -0.0011, + "num_tokens": 13873181.0, + "reward": 13.66617202758789, + "reward_std": 0.6704643368721008, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.833747148513794, + "rewards/length2tails_reward/std": 0.21232077479362488, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498504638671875, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08754700422286987, + "epoch": 3.182, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1134149506688118, + "learning_rate": 6.069749683460764e-07, + "loss": -0.0025, + "num_tokens": 13881944.0, + "reward": 13.244776725769043, + "reward_std": 1.9161473512649536, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.506251811981201, + "rewards/kidney_reward/std": 0.5428574681282043, + "rewards/length2tails_reward/mean": 0.8059432506561279, + "rewards/length2tails_reward/std": 0.22836993634700775, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1967456340789795, + "rewards/thermo_reward/std": 1.510373592376709, + "step": 1591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08679799642413855, + "epoch": 3.184, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13869330286979675, + "learning_rate": 6.057961972152578e-07, + "loss": 0.0036, + "num_tokens": 13890708.0, + "reward": 13.838022232055664, + "reward_std": 0.42595183849334717, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8084931373596191, + "rewards/length2tails_reward/std": 0.2337682992219925, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08560691121965647, + "epoch": 3.186, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0591062568128109, + "learning_rate": 6.046180742535441e-07, + "loss": -0.0063, + "num_tokens": 13899456.0, + "reward": 13.649131774902344, + "reward_std": 1.1960653066635132, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7516544461250305, + "rewards/length2tails_reward/std": 0.29946169257164, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.513659954071045, + "rewards/thermo_reward/std": 1.149131417274475, + "step": 1593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.08243997674435377, + "epoch": 3.188, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12632687389850616, + "learning_rate": 6.034406013980628e-07, + "loss": 0.0035, + "num_tokens": 13908159.0, + "reward": 13.668336868286133, + "reward_std": 0.5609111189842224, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7070699334144592, + "rewards/length2tails_reward/std": 0.323891818523407, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5099644660949707, + "rewards/thermo_reward/std": 0.5615194439888, + "step": 1594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 273.03125, + "completions/mean_terminated_length": 273.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09201567247509956, + "epoch": 3.19, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6218602657318115, + "learning_rate": 6.022637805848723e-07, + "loss": 0.0218, + "num_tokens": 13916928.0, + "reward": 13.492816925048828, + "reward_std": 1.2237213850021362, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7628862857818604, + "rewards/length2tails_reward/std": 0.2679445147514343, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.356222152709961, + "rewards/thermo_reward/std": 1.125559687614441, + "step": 1595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08387936279177666, + "epoch": 3.192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11393842101097107, + "learning_rate": 6.010876137489583e-07, + "loss": 0.0029, + "num_tokens": 13925660.0, + "reward": 13.91272258758545, + "reward_std": 0.31023505330085754, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7577773332595825, + "rewards/length2tails_reward/std": 0.23585514724254608, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0865836851298809, + "epoch": 3.194, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12148909270763397, + "learning_rate": 5.999121028242322e-07, + "loss": -0.003, + "num_tokens": 13934404.0, + "reward": 13.874883651733398, + "reward_std": 0.38329583406448364, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7782378196716309, + "rewards/length2tails_reward/std": 0.25293105840682983, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09070562291890383, + "epoch": 3.196, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12602317333221436, + "learning_rate": 5.987372497435258e-07, + "loss": -0.0026, + "num_tokens": 13943151.0, + "reward": 13.795031547546387, + "reward_std": 0.4765709638595581, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7774432897567749, + "rewards/length2tails_reward/std": 0.265408992767334, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.8125, + "completions/mean_terminated_length": 269.8125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.08400048688054085, + "epoch": 3.198, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.165093332529068, + "learning_rate": 5.975630564385901e-07, + "loss": 0.0061, + "num_tokens": 13951817.0, + "reward": 13.735469818115234, + "reward_std": 0.6187280416488647, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7290137410163879, + "rewards/length2tails_reward/std": 0.24236340820789337, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08165622828528285, + "epoch": 3.2, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06480145454406738, + "learning_rate": 5.96389524840091e-07, + "loss": -0.002, + "num_tokens": 13960533.0, + "reward": 13.647340774536133, + "reward_std": 1.2094513177871704, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.728061318397522, + "rewards/length2tails_reward/std": 0.2604650855064392, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4868698120117188, + "rewards/thermo_reward/std": 1.2013118267059326, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08520669303834438, + "epoch": 3.202, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2728014588356018, + "learning_rate": 5.952166568776062e-07, + "loss": 0.0036, + "num_tokens": 13969247.0, + "reward": 13.399568557739258, + "reward_std": 2.2984795570373535, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.449371814727783, + "rewards/kidney_reward/std": 0.8562029600143433, + "rewards/length2tails_reward/mean": 0.7222508192062378, + "rewards/length2tails_reward/std": 0.28240150213241577, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4167864322662354, + "rewards/thermo_reward/std": 1.4644033908843994, + "step": 1601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.09271425474435091, + "epoch": 3.204, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10972724854946136, + "learning_rate": 5.940444544796222e-07, + "loss": -0.0014, + "num_tokens": 13977950.0, + "reward": 13.836832046508789, + "reward_std": 0.4348587691783905, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7965885996818542, + "rewards/length2tails_reward/std": 0.2626652121543884, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08314972370862961, + "epoch": 3.206, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12602828443050385, + "learning_rate": 5.928729195735318e-07, + "loss": -0.0027, + "num_tokens": 13986708.0, + "reward": 13.495146751403809, + "reward_std": 1.7042378187179565, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.5109715461730957, + "rewards/kidney_reward/std": 0.5173211097717285, + "rewards/length2tails_reward/mean": 0.8184359073638916, + "rewards/length2tails_reward/std": 0.2159198671579361, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5561649799346924, + "rewards/thermo_reward/std": 0.925395131111145, + "step": 1603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.08746057841926813, + "epoch": 3.208, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26632049679756165, + "learning_rate": 5.917020540856294e-07, + "loss": -0.0028, + "num_tokens": 13995473.0, + "reward": 12.22585678100586, + "reward_std": 6.362361431121826, + "rewards/fitness_reward/mean": 6.649763107299805, + "rewards/fitness_reward/std": 2.8078510761260986, + "rewards/kidney_reward/mean": 2.2407450675964355, + "rewards/kidney_reward/std": 1.53963041305542, + "rewards/length2tails_reward/mean": 0.8339070677757263, + "rewards/length2tails_reward/std": 0.22544747591018677, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1519570350646973, + "rewards/thermo_reward/std": 2.0436434745788574, + "step": 1604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.0865698466077447, + "epoch": 3.21, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10460247844457626, + "learning_rate": 5.905318599411097e-07, + "loss": -0.0009, + "num_tokens": 14004227.0, + "reward": 13.839855194091797, + "reward_std": 0.4297863841056824, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8268245458602905, + "rewards/length2tails_reward/std": 0.16668696701526642, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.09375, + "completions/mean_terminated_length": 270.09375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.07823292072862387, + "epoch": 3.212, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19168511033058167, + "learning_rate": 5.89362339064062e-07, + "loss": -0.0016, + "num_tokens": 14012902.0, + "reward": 13.867661476135254, + "reward_std": 0.3797597885131836, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7060228586196899, + "rewards/length2tails_reward/std": 0.3517718017101288, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 288.71875, + "completions/mean_terminated_length": 273.70965576171875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.10296936891973019, + "epoch": 3.214, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4714829623699188, + "learning_rate": 5.881934933774701e-07, + "loss": -0.0166, + "num_tokens": 14022173.0, + "reward": 13.924141883850098, + "reward_std": 0.3132314682006836, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8719671964645386, + "rewards/length2tails_reward/std": 0.1845170557498932, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.08533268887549639, + "epoch": 3.216, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0771593302488327, + "learning_rate": 5.870253248032067e-07, + "loss": 0.0064, + "num_tokens": 14030912.0, + "reward": 13.835899353027344, + "reward_std": 0.4224575161933899, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7872604131698608, + "rewards/length2tails_reward/std": 0.26647889614105225, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08216171525418758, + "epoch": 3.218, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12162912636995316, + "learning_rate": 5.858578352620321e-07, + "loss": -0.0009, + "num_tokens": 14039636.0, + "reward": 13.789643287658691, + "reward_std": 0.4721164107322693, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7235633134841919, + "rewards/length2tails_reward/std": 0.2913040816783905, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.65625, + "completions/mean_terminated_length": 273.65625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10002035647630692, + "epoch": 3.22, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05761205032467842, + "learning_rate": 5.846910266735889e-07, + "loss": -0.0064, + "num_tokens": 14048425.0, + "reward": 13.744827270507812, + "reward_std": 1.2510325908660889, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8414597511291504, + "rewards/length2tails_reward/std": 0.23725688457489014, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.600375175476074, + "rewards/thermo_reward/std": 1.084394931793213, + "step": 1610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.07954447437077761, + "epoch": 3.222, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12211769074201584, + "learning_rate": 5.835249009564012e-07, + "loss": 0.0016, + "num_tokens": 14057189.0, + "reward": 13.800848960876465, + "reward_std": 0.46823522448539734, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8356137871742249, + "rewards/length2tails_reward/std": 0.18387895822525024, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.09631557948887348, + "epoch": 3.224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3751218616962433, + "learning_rate": 5.82359460027869e-07, + "loss": -0.0072, + "num_tokens": 14065934.0, + "reward": 13.086830139160156, + "reward_std": 3.9839515686035156, + "rewards/fitness_reward/mean": 7.038168907165527, + "rewards/fitness_reward/std": 1.8272552490234375, + "rewards/kidney_reward/mean": 2.468914747238159, + "rewards/kidney_reward/std": 0.8913218379020691, + "rewards/length2tails_reward/mean": 0.7623480558395386, + "rewards/length2tails_reward/std": 0.2841353416442871, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4035110473632812, + "rewards/thermo_reward/std": 1.328399896621704, + "step": 1612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08313568867743015, + "epoch": 3.226, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12886476516723633, + "learning_rate": 5.811947058042676e-07, + "loss": 0.0039, + "num_tokens": 14074673.0, + "reward": 13.833187103271484, + "reward_std": 0.4242973327636719, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.760138988494873, + "rewards/length2tails_reward/std": 0.26653754711151123, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.34375, + "completions/mean_terminated_length": 273.34375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08375045098364353, + "epoch": 3.228, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05681309849023819, + "learning_rate": 5.800306402007427e-07, + "loss": 0.0023, + "num_tokens": 14083452.0, + "reward": 13.955066680908203, + "reward_std": 0.22359304130077362, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7823522090911865, + "rewards/length2tails_reward/std": 0.29082199931144714, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0816805730573833, + "epoch": 3.23, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08435562252998352, + "learning_rate": 5.788672651313078e-07, + "loss": -0.0037, + "num_tokens": 14092140.0, + "reward": 13.383923530578613, + "reward_std": 1.6765855550765991, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.6398475170135498, + "rewards/length2tails_reward/std": 0.33842137455940247, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3143515586853027, + "rewards/thermo_reward/std": 1.4448281526565552, + "step": 1615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.07523797173053026, + "epoch": 3.232, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0788172259926796, + "learning_rate": 5.777045825088403e-07, + "loss": -0.0042, + "num_tokens": 14100906.0, + "reward": 13.541112899780273, + "reward_std": 1.3472471237182617, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5384316444396973, + "rewards/kidney_reward/std": 0.4980745017528534, + "rewards/length2tails_reward/mean": 0.7591254711151123, + "rewards/length2tails_reward/std": 0.333173930644989, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.523092269897461, + "rewards/thermo_reward/std": 0.906480073928833, + "step": 1616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0888227610848844, + "epoch": 3.234, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13339588046073914, + "learning_rate": 5.765425942450801e-07, + "loss": 0.0032, + "num_tokens": 14109671.0, + "reward": 13.539932250976562, + "reward_std": 1.5400187969207764, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.534397602081299, + "rewards/kidney_reward/std": 0.520893931388855, + "rewards/length2tails_reward/mean": 0.8074017763137817, + "rewards/length2tails_reward/std": 0.2371595948934555, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.463608741760254, + "rewards/thermo_reward/std": 1.1467925310134888, + "step": 1617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.08746557729318738, + "epoch": 3.2359999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19486036896705627, + "learning_rate": 5.753813022506247e-07, + "loss": -0.0104, + "num_tokens": 14118434.0, + "reward": 13.883773803710938, + "reward_std": 0.3759906589984894, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.867151141166687, + "rewards/length2tails_reward/std": 0.20712190866470337, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.40625, + "completions/mean_terminated_length": 272.40625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08746451884508133, + "epoch": 3.238, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10658274590969086, + "learning_rate": 5.742207084349273e-07, + "loss": -0.0036, + "num_tokens": 14127183.0, + "reward": 13.65631103515625, + "reward_std": 0.9603863954544067, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7335498332977295, + "rewards/length2tails_reward/std": 0.2952185273170471, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.522650718688965, + "rewards/thermo_reward/std": 0.9086843132972717, + "step": 1619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08457469288259745, + "epoch": 3.24, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0973481833934784, + "learning_rate": 5.730608147062925e-07, + "loss": 0.0027, + "num_tokens": 14135948.0, + "reward": 13.033241271972656, + "reward_std": 4.528761386871338, + "rewards/fitness_reward/mean": 6.997109889984131, + "rewards/fitness_reward/std": 2.0595204830169678, + "rewards/kidney_reward/mean": 2.438572645187378, + "rewards/kidney_reward/std": 1.0629626512527466, + "rewards/length2tails_reward/mean": 0.786112904548645, + "rewards/length2tails_reward/std": 0.272849977016449, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.418948173522949, + "rewards/thermo_reward/std": 1.452711582183838, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.25, + "completions/mean_terminated_length": 273.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08236623834818602, + "epoch": 3.242, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11871770024299622, + "learning_rate": 5.719016229718747e-07, + "loss": 0.0008, + "num_tokens": 14144724.0, + "reward": 13.879434585571289, + "reward_std": 0.37527257204055786, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.823756217956543, + "rewards/length2tails_reward/std": 0.23749664425849915, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.09134671650826931, + "epoch": 3.2439999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1924533098936081, + "learning_rate": 5.707431351376726e-07, + "loss": 0.0038, + "num_tokens": 14153434.0, + "reward": 13.956525802612305, + "reward_std": 0.22384217381477356, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7969448566436768, + "rewards/length2tails_reward/std": 0.2788224220275879, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.0625, + "completions/mean_terminated_length": 271.0625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.09193257614970207, + "epoch": 3.246, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17157341539859772, + "learning_rate": 5.695853531085286e-07, + "loss": 0.0052, + "num_tokens": 14162140.0, + "reward": 13.958206176757812, + "reward_std": 0.22330915927886963, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8137524127960205, + "rewards/length2tails_reward/std": 0.21452507376670837, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.08646232448518276, + "epoch": 3.248, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10585270076990128, + "learning_rate": 5.684282787881247e-07, + "loss": 0.0025, + "num_tokens": 14170898.0, + "reward": 13.920055389404297, + "reward_std": 0.3132040500640869, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8310965299606323, + "rewards/length2tails_reward/std": 0.1681250035762787, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.875, + "completions/mean_terminated_length": 270.875, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.08505373820662498, + "epoch": 3.25, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08246946334838867, + "learning_rate": 5.672719140789785e-07, + "loss": 0.002, + "num_tokens": 14179598.0, + "reward": 13.871101379394531, + "reward_std": 0.37816736102104187, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7404244542121887, + "rewards/length2tails_reward/std": 0.3039153516292572, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.08091865479946136, + "epoch": 3.252, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07720610499382019, + "learning_rate": 5.661162608824419e-07, + "loss": -0.0026, + "num_tokens": 14188288.0, + "reward": 13.658710479736328, + "reward_std": 1.0409196615219116, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7231732606887817, + "rewards/length2tails_reward/std": 0.3009643852710724, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.526088237762451, + "rewards/thermo_reward/std": 0.8915759921073914, + "step": 1626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08896706625819206, + "epoch": 3.254, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10086528956890106, + "learning_rate": 5.649613210986953e-07, + "loss": 0.0011, + "num_tokens": 14197053.0, + "reward": 13.81051254272461, + "reward_std": 0.5248230695724487, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8069740533828735, + "rewards/length2tails_reward/std": 0.24617759883403778, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.71875, + "completions/mean_terminated_length": 272.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0814590915106237, + "epoch": 3.2560000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12942053377628326, + "learning_rate": 5.638070966267479e-07, + "loss": -0.0027, + "num_tokens": 14205812.0, + "reward": 13.797609329223633, + "reward_std": 0.47521042823791504, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8032171726226807, + "rewards/length2tails_reward/std": 0.24287335574626923, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.15625, + "completions/mean_terminated_length": 273.15625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08510150481015444, + "epoch": 3.258, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.226546511054039, + "learning_rate": 5.626535893644307e-07, + "loss": 0.0052, + "num_tokens": 14214585.0, + "reward": 13.579366683959961, + "reward_std": 1.3661936521530151, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7992387413978577, + "rewards/length2tails_reward/std": 0.2424638569355011, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.524005651473999, + "rewards/thermo_reward/std": 0.9019290804862976, + "step": 1629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09344392456114292, + "epoch": 3.26, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1050717830657959, + "learning_rate": 5.615008012083973e-07, + "loss": -0.0008, + "num_tokens": 14223328.0, + "reward": 13.75810432434082, + "reward_std": 0.5049390196800232, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8070328831672668, + "rewards/length2tails_reward/std": 0.22400563955307007, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.09122634120285511, + "epoch": 3.262, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1369512975215912, + "learning_rate": 5.60348734054118e-07, + "loss": 0.001, + "num_tokens": 14232016.0, + "reward": 13.58402156829834, + "reward_std": 1.7690011262893677, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4892263412475586, + "rewards/kidney_reward/std": 0.6357679963111877, + "rewards/length2tails_reward/mean": 0.8118189573287964, + "rewards/length2tails_reward/std": 0.25912174582481384, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5524284839630127, + "rewards/thermo_reward/std": 1.1428624391555786, + "step": 1631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 273.1875, + "completions/mean_terminated_length": 273.1875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.08804465923458338, + "epoch": 3.2640000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17186035215854645, + "learning_rate": 5.591973897958781e-07, + "loss": 0.0017, + "num_tokens": 14240790.0, + "reward": 13.677754402160645, + "reward_std": 0.9940446615219116, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.848839282989502, + "rewards/length2tails_reward/std": 0.1896086186170578, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5052061080932617, + "rewards/thermo_reward/std": 0.9968248605728149, + "step": 1632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09215592127293348, + "epoch": 3.266, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17900444567203522, + "learning_rate": 5.580467703267735e-07, + "loss": -0.0005, + "num_tokens": 14249555.0, + "reward": 13.771299362182617, + "reward_std": 0.553092896938324, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8137115240097046, + "rewards/length2tails_reward/std": 0.2380651831626892, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.09035858232527971, + "epoch": 3.268, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10600990802049637, + "learning_rate": 5.568968775387088e-07, + "loss": 0.0018, + "num_tokens": 14258321.0, + "reward": 13.639888763427734, + "reward_std": 0.5818853974342346, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8214516043663025, + "rewards/length2tails_reward/std": 0.1866902858018875, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.470078706741333, + "rewards/thermo_reward/std": 0.5830413699150085, + "step": 1634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.08797851856797934, + "epoch": 3.27, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1364179253578186, + "learning_rate": 5.55747713322394e-07, + "loss": 0.0075, + "num_tokens": 14267030.0, + "reward": 13.79644775390625, + "reward_std": 0.4668932259082794, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7916043996810913, + "rewards/length2tails_reward/std": 0.24688823521137238, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07624836079776287, + "epoch": 3.2720000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07006789743900299, + "learning_rate": 5.545992795673407e-07, + "loss": -0.0048, + "num_tokens": 14275790.0, + "reward": 13.2792329788208, + "reward_std": 2.7991321086883545, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.4431636333465576, + "rewards/kidney_reward/std": 0.8907662630081177, + "rewards/length2tails_reward/mean": 0.7920898199081421, + "rewards/length2tails_reward/std": 0.27686822414398193, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.410693645477295, + "rewards/thermo_reward/std": 1.2904707193374634, + "step": 1636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09355740807950497, + "epoch": 3.274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10698295384645462, + "learning_rate": 5.534515781618603e-07, + "loss": 0.0051, + "num_tokens": 14284526.0, + "reward": 13.92752456665039, + "reward_std": 0.37725645303726196, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7805237770080566, + "rewards/length2tails_reward/std": 0.2552531957626343, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5, + "completions/mean_terminated_length": 272.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08121457789093256, + "epoch": 3.276, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10736272484064102, + "learning_rate": 5.523046109930586e-07, + "loss": -0.0001, + "num_tokens": 14293278.0, + "reward": 13.421783447265625, + "reward_std": 2.360363721847534, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.484513759613037, + "rewards/kidney_reward/std": 0.8030804395675659, + "rewards/length2tails_reward/mean": 0.8002185821533203, + "rewards/length2tails_reward/std": 0.24185405671596527, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.453571319580078, + "rewards/thermo_reward/std": 1.2668566703796387, + "step": 1638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.71875, + "completions/mean_terminated_length": 272.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08090145420283079, + "epoch": 3.278, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19558988511562347, + "learning_rate": 5.511583799468351e-07, + "loss": 0.005, + "num_tokens": 14302037.0, + "reward": 13.693817138671875, + "reward_std": 1.0849432945251465, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8103063106536865, + "rewards/length2tails_reward/std": 0.22535519301891327, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.552481174468994, + "rewards/thermo_reward/std": 0.9444604516029358, + "step": 1639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.08695127349346876, + "epoch": 3.2800000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10942727327346802, + "learning_rate": 5.500128869078788e-07, + "loss": -0.0046, + "num_tokens": 14310751.0, + "reward": 13.836432456970215, + "reward_std": 0.44083523750305176, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7925859689712524, + "rewards/length2tails_reward/std": 0.23117829859256744, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 286.25, + "completions/mean_terminated_length": 271.1612854003906, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09664624650031328, + "epoch": 3.282, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7048646211624146, + "learning_rate": 5.488681337596653e-07, + "loss": -0.014, + "num_tokens": 14319943.0, + "reward": 12.315906524658203, + "reward_std": 6.261025905609131, + "rewards/fitness_reward/mean": 6.578765392303467, + "rewards/fitness_reward/std": 3.084444046020508, + "rewards/kidney_reward/mean": 2.2847700119018555, + "rewards/kidney_reward/std": 1.4948978424072266, + "rewards/length2tails_reward/mean": 0.711010754108429, + "rewards/length2tails_reward/std": 0.2972946763038635, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2812695503234863, + "rewards/thermo_reward/std": 1.7976088523864746, + "step": 1641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.1875, + "completions/mean_terminated_length": 270.1875, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.08683570567518473, + "epoch": 3.284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09333089739084244, + "learning_rate": 5.477241223844538e-07, + "loss": -0.0044, + "num_tokens": 14328621.0, + "reward": 13.862743377685547, + "reward_std": 0.38920146226882935, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.6568487882614136, + "rewards/length2tails_reward/std": 0.3376544713973999, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 274.5, + "completions/mean_terminated_length": 274.5, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0855266572907567, + "epoch": 3.286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11528667062520981, + "learning_rate": 5.465808546632829e-07, + "loss": -0.0029, + "num_tokens": 14337437.0, + "reward": 13.918694496154785, + "reward_std": 0.3151986300945282, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8174866437911987, + "rewards/length2tails_reward/std": 0.18618960678577423, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.08168186899274588, + "epoch": 3.288, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10211341828107834, + "learning_rate": 5.454383324759693e-07, + "loss": 0.0062, + "num_tokens": 14346153.0, + "reward": 13.176010131835938, + "reward_std": 3.7224204540252686, + "rewards/fitness_reward/mean": 7.047780990600586, + "rewards/fitness_reward/std": 1.772882342338562, + "rewards/kidney_reward/mean": 2.4853415489196777, + "rewards/kidney_reward/std": 0.7983972430229187, + "rewards/length2tails_reward/mean": 0.7846165895462036, + "rewards/length2tails_reward/std": 0.28638705611228943, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.464426040649414, + "rewards/thermo_reward/std": 1.2092392444610596, + "step": 1644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0881417142227292, + "epoch": 3.29, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13756471872329712, + "learning_rate": 5.442965577011038e-07, + "loss": -0.0025, + "num_tokens": 14354895.0, + "reward": 13.877664566040039, + "reward_std": 0.3820107877254486, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8060472011566162, + "rewards/length2tails_reward/std": 0.2322707325220108, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.3125, + "completions/mean_terminated_length": 273.3125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0942190820351243, + "epoch": 3.292, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10850630700588226, + "learning_rate": 5.431555322160482e-07, + "loss": 0.0034, + "num_tokens": 14363673.0, + "reward": 13.76106071472168, + "reward_std": 0.5022327899932861, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8365955948829651, + "rewards/length2tails_reward/std": 0.1620035618543625, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897367000579834, + "rewards/thermo_reward/std": 0.5061467885971069, + "step": 1646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08697703760117292, + "epoch": 3.294, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12967857718467712, + "learning_rate": 5.420152578969325e-07, + "loss": -0.0004, + "num_tokens": 14372439.0, + "reward": 13.880912780761719, + "reward_std": 0.37692704796791077, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8385316133499146, + "rewards/length2tails_reward/std": 0.18997938930988312, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.09083990287035704, + "epoch": 3.296, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1563207358121872, + "learning_rate": 5.408757366186507e-07, + "loss": 0.0048, + "num_tokens": 14381209.0, + "reward": 13.851879119873047, + "reward_std": 0.47710543870925903, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8217875957489014, + "rewards/length2tails_reward/std": 0.19817610085010529, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08517451956868172, + "epoch": 3.298, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2848761975765228, + "learning_rate": 5.397369702548594e-07, + "loss": 0.0025, + "num_tokens": 14389957.0, + "reward": 12.93878173828125, + "reward_std": 4.8922038078308105, + "rewards/fitness_reward/mean": 6.990458011627197, + "rewards/fitness_reward/std": 2.09714937210083, + "rewards/kidney_reward/mean": 2.399808406829834, + "rewards/kidney_reward/std": 1.1330952644348145, + "rewards/length2tails_reward/mean": 0.757195234298706, + "rewards/length2tails_reward/std": 0.28747931122779846, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.372796058654785, + "rewards/thermo_reward/std": 1.7039433717727661, + "step": 1649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.71875, + "completions/mean_terminated_length": 272.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0872359941713512, + "epoch": 3.3, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11530021578073502, + "learning_rate": 5.385989606779736e-07, + "loss": 0.0056, + "num_tokens": 14398716.0, + "reward": 13.840316772460938, + "reward_std": 0.42341527342796326, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8314379453659058, + "rewards/length2tails_reward/std": 0.21843001246452332, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08460887148976326, + "epoch": 3.302, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1057276651263237, + "learning_rate": 5.37461709759165e-07, + "loss": -0.0009, + "num_tokens": 14407482.0, + "reward": 13.875188827514648, + "reward_std": 0.38063186407089233, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7812926173210144, + "rewards/length2tails_reward/std": 0.25195032358169556, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08779798075556755, + "epoch": 3.304, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11126738786697388, + "learning_rate": 5.363252193683556e-07, + "loss": -0.0042, + "num_tokens": 14416220.0, + "reward": 13.283806800842285, + "reward_std": 1.834874153137207, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7586894035339355, + "rewards/length2tails_reward/std": 0.3202492594718933, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.290010452270508, + "rewards/thermo_reward/std": 1.2315627336502075, + "step": 1652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08191088866442442, + "epoch": 3.306, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13026760518550873, + "learning_rate": 5.351894913742192e-07, + "loss": 0.0005, + "num_tokens": 14424963.0, + "reward": 13.807825088500977, + "reward_std": 0.8251524567604065, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.653838634490967, + "rewards/kidney_reward/std": 0.15476679801940918, + "rewards/length2tails_reward/mean": 0.7754607200622559, + "rewards/length2tails_reward/std": 0.2539330720901489, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.615255355834961, + "rewards/thermo_reward/std": 0.8070117831230164, + "step": 1653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08574399258941412, + "epoch": 3.308, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1388465017080307, + "learning_rate": 5.340545276441754e-07, + "loss": 0.0011, + "num_tokens": 14433724.0, + "reward": 13.835010528564453, + "reward_std": 0.42451682686805725, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7783781290054321, + "rewards/length2tails_reward/std": 0.2693405747413635, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.08541551604866982, + "epoch": 3.31, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5051350593566895, + "learning_rate": 5.32920330044386e-07, + "loss": -0.0118, + "num_tokens": 14442435.0, + "reward": 13.449106216430664, + "reward_std": 2.082252025604248, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4897396564483643, + "rewards/kidney_reward/std": 0.6329513788223267, + "rewards/length2tails_reward/mean": 0.825819730758667, + "rewards/length2tails_reward/std": 0.2356375902891159, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4731087684631348, + "rewards/thermo_reward/std": 1.1634302139282227, + "step": 1655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08114295545965433, + "epoch": 3.312, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06512371450662613, + "learning_rate": 5.317869004397544e-07, + "loss": 0.0021, + "num_tokens": 14451173.0, + "reward": 13.90968132019043, + "reward_std": 0.3100299835205078, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7273616790771484, + "rewards/length2tails_reward/std": 0.3085137903690338, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08665010053664446, + "epoch": 3.314, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11714247614145279, + "learning_rate": 5.306542406939206e-07, + "loss": 0.0042, + "num_tokens": 14459943.0, + "reward": 13.71487045288086, + "reward_std": 0.5333173871040344, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7735536098480225, + "rewards/length2tails_reward/std": 0.306517094373703, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498504638671875, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.65625, + "completions/mean_terminated_length": 271.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08406441286206245, + "epoch": 3.316, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10046909004449844, + "learning_rate": 5.295223526692593e-07, + "loss": -0.0017, + "num_tokens": 14468668.0, + "reward": 13.609403610229492, + "reward_std": 0.9153671860694885, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7325645089149475, + "rewards/length2tails_reward/std": 0.27868160605430603, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4484822750091553, + "rewards/thermo_reward/std": 0.9082307815551758, + "step": 1658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09391424618661404, + "epoch": 3.318, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07182227820158005, + "learning_rate": 5.283912382268762e-07, + "loss": -0.0013, + "num_tokens": 14477440.0, + "reward": 13.687997817993164, + "reward_std": 0.927980363368988, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8127250671386719, + "rewards/length2tails_reward/std": 0.25279492139816284, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5190603733062744, + "rewards/thermo_reward/std": 0.9266504049301147, + "step": 1659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0778304161503911, + "epoch": 3.32, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07771207392215729, + "learning_rate": 5.272608992266039e-07, + "loss": -0.0017, + "num_tokens": 14486195.0, + "reward": 13.879322052001953, + "reward_std": 0.38042446970939636, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8226305842399597, + "rewards/length2tails_reward/std": 0.21475577354431152, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08480161521583796, + "epoch": 3.322, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09371089190244675, + "learning_rate": 5.261313375270013e-07, + "loss": -0.0041, + "num_tokens": 14494938.0, + "reward": 13.952810287475586, + "reward_std": 0.23375768959522247, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7597858905792236, + "rewards/length2tails_reward/std": 0.2678978443145752, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09179816581308842, + "epoch": 3.324, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14725930988788605, + "learning_rate": 5.250025549853491e-07, + "loss": 0.0017, + "num_tokens": 14503671.0, + "reward": 13.831579208374023, + "reward_std": 0.4285857081413269, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7440521717071533, + "rewards/length2tails_reward/std": 0.28048190474510193, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08306058449670672, + "epoch": 3.326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08595037460327148, + "learning_rate": 5.238745534576461e-07, + "loss": -0.0046, + "num_tokens": 14512392.0, + "reward": 12.498080253601074, + "reward_std": 4.544852256774902, + "rewards/fitness_reward/mean": 6.621407985687256, + "rewards/fitness_reward/std": 2.7026584148406982, + "rewards/kidney_reward/mean": 2.4149489402770996, + "rewards/kidney_reward/std": 0.7349871397018433, + "rewards/length2tails_reward/mean": 0.7154628038406372, + "rewards/length2tails_reward/std": 0.32782670855522156, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.290177345275879, + "rewards/thermo_reward/std": 1.4791014194488525, + "step": 1663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0952264815568924, + "epoch": 3.328, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09594647586345673, + "learning_rate": 5.227473347986082e-07, + "loss": 0.0028, + "num_tokens": 14521143.0, + "reward": 13.879316329956055, + "reward_std": 0.37391072511672974, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8225646018981934, + "rewards/length2tails_reward/std": 0.2126975804567337, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07835912797600031, + "epoch": 3.33, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13977433741092682, + "learning_rate": 5.216209008616621e-07, + "loss": -0.0014, + "num_tokens": 14529874.0, + "reward": 13.880647659301758, + "reward_std": 0.4383620321750641, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7106102705001831, + "rewards/length2tails_reward/std": 0.31361716985702515, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.08603645162656903, + "epoch": 3.332, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.155784472823143, + "learning_rate": 5.204952534989462e-07, + "loss": -0.0121, + "num_tokens": 14538606.0, + "reward": 13.785726547241211, + "reward_std": 0.8251009583473206, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8525975346565247, + "rewards/length2tails_reward/std": 0.17560173571109772, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.612802505493164, + "rewards/thermo_reward/std": 0.8198140859603882, + "step": 1666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.09190623741596937, + "epoch": 3.334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1395697444677353, + "learning_rate": 5.193703945613043e-07, + "loss": 0.0044, + "num_tokens": 14547299.0, + "reward": 13.883354187011719, + "reward_std": 0.37622812390327454, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8629554510116577, + "rewards/length2tails_reward/std": 0.20908312499523163, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08574157673865557, + "epoch": 3.336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11695639044046402, + "learning_rate": 5.182463258982846e-07, + "loss": 0.0033, + "num_tokens": 14556053.0, + "reward": 13.877433776855469, + "reward_std": 0.3750380575656891, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8037418127059937, + "rewards/length2tails_reward/std": 0.21794305741786957, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.08719206042587757, + "epoch": 3.338, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16675855219364166, + "learning_rate": 5.171230493581358e-07, + "loss": -0.0018, + "num_tokens": 14564783.0, + "reward": 13.399309158325195, + "reward_std": 2.053853988647461, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4316983222961426, + "rewards/kidney_reward/std": 0.7683201432228088, + "rewards/length2tails_reward/mean": 0.8151653409004211, + "rewards/length2tails_reward/std": 0.2573068141937256, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4249095916748047, + "rewards/thermo_reward/std": 1.2939376831054688, + "step": 1669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 282.625, + "completions/mean_terminated_length": 282.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10062659624963999, + "epoch": 3.34, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47647494077682495, + "learning_rate": 5.160005667878033e-07, + "loss": 0.0455, + "num_tokens": 14573859.0, + "reward": 13.183435440063477, + "reward_std": 3.1175551414489746, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.511730670928955, + "rewards/kidney_reward/std": 0.5132253766059875, + "rewards/length2tails_reward/mean": 0.8262677192687988, + "rewards/length2tails_reward/std": 0.2650676369667053, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4360239505767822, + "rewards/thermo_reward/std": 1.1104899644851685, + "step": 1670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.5625, + "completions/mean_terminated_length": 273.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08673338778316975, + "epoch": 3.342, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1299479454755783, + "learning_rate": 5.148788800329277e-07, + "loss": 0.0016, + "num_tokens": 14582645.0, + "reward": 13.880162239074707, + "reward_std": 0.374368280172348, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8310284614562988, + "rewards/length2tails_reward/std": 0.24202223122119904, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.09135918691754341, + "epoch": 3.344, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08872079104185104, + "learning_rate": 5.137579909378417e-07, + "loss": -0.004, + "num_tokens": 14591361.0, + "reward": 13.507393836975098, + "reward_std": 1.6749547719955444, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.536761999130249, + "rewards/kidney_reward/std": 0.5075190663337708, + "rewards/length2tails_reward/mean": 0.8185112476348877, + "rewards/length2tails_reward/std": 0.23466837406158447, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.427595615386963, + "rewards/thermo_reward/std": 1.2838958501815796, + "step": 1672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.34375, + "completions/mean_terminated_length": 273.34375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.085330362431705, + "epoch": 3.346, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09901421517133713, + "learning_rate": 5.126379013455655e-07, + "loss": 0.0042, + "num_tokens": 14600140.0, + "reward": 13.88277816772461, + "reward_std": 0.3741099238395691, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8571890592575073, + "rewards/length2tails_reward/std": 0.15984788537025452, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.5, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08711870899423957, + "epoch": 3.348, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09154262393712997, + "learning_rate": 5.115186130978046e-07, + "loss": 0.0012, + "num_tokens": 14608924.0, + "reward": 13.653093338012695, + "reward_std": 0.6277295351028442, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8282286524772644, + "rewards/length2tails_reward/std": 0.24916499853134155, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.50996470451355, + "rewards/thermo_reward/std": 0.5615194439888, + "step": 1674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5, + "completions/mean_terminated_length": 272.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0861954502761364, + "epoch": 3.35, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1118447333574295, + "learning_rate": 5.104001280349479e-07, + "loss": 0.0055, + "num_tokens": 14617676.0, + "reward": 13.753969192504883, + "reward_std": 0.4972566068172455, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.765673816204071, + "rewards/length2tails_reward/std": 0.31640881299972534, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08270327001810074, + "epoch": 3.352, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13411538302898407, + "learning_rate": 5.092824479960625e-07, + "loss": -0.0039, + "num_tokens": 14626398.0, + "reward": 13.68724250793457, + "reward_std": 1.2245506048202515, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.77997887134552, + "rewards/length2tails_reward/std": 0.2146291881799698, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.521580696105957, + "rewards/thermo_reward/std": 1.2119030952453613, + "step": 1676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.081757552921772, + "epoch": 3.354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08777818828821182, + "learning_rate": 5.081655748188923e-07, + "loss": 0.0, + "num_tokens": 14635082.0, + "reward": 13.873549461364746, + "reward_std": 0.3831973671913147, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7649028301239014, + "rewards/length2tails_reward/std": 0.27656090259552, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.6875, + "completions/mean_terminated_length": 273.6875, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.09073897916823626, + "epoch": 3.356, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10093200951814651, + "learning_rate": 5.070495103398551e-07, + "loss": 0.0035, + "num_tokens": 14643872.0, + "reward": 13.886987686157227, + "reward_std": 0.375377357006073, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8992826342582703, + "rewards/length2tails_reward/std": 0.15805500745773315, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09156694356352091, + "epoch": 3.358, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08219363540410995, + "learning_rate": 5.059342563940383e-07, + "loss": 0.001, + "num_tokens": 14652602.0, + "reward": 13.767139434814453, + "reward_std": 0.5540488958358765, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7721138000488281, + "rewards/length2tails_reward/std": 0.23606324195861816, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 274.6875, + "completions/mean_terminated_length": 274.6875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.08526533050462604, + "epoch": 3.36, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09103113412857056, + "learning_rate": 5.048198148151968e-07, + "loss": 0.0024, + "num_tokens": 14661424.0, + "reward": 13.373743057250977, + "reward_std": 1.516675591468811, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5116631984710693, + "rewards/kidney_reward/std": 0.5135889053344727, + "rewards/length2tails_reward/mean": 0.8729467988014221, + "rewards/length2tails_reward/std": 0.22229276597499847, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3136000633239746, + "rewards/thermo_reward/std": 1.1403894424438477, + "step": 1680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08375388663262129, + "epoch": 3.362, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0875633955001831, + "learning_rate": 5.037061874357502e-07, + "loss": -0.0035, + "num_tokens": 14670150.0, + "reward": 13.83285140991211, + "reward_std": 0.4310167133808136, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.756779670715332, + "rewards/length2tails_reward/std": 0.2828678786754608, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08747315965592861, + "epoch": 3.364, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12928470969200134, + "learning_rate": 5.025933760867781e-07, + "loss": 0.0077, + "num_tokens": 14678886.0, + "reward": 13.807638168334961, + "reward_std": 0.5141726732254028, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7782381772994995, + "rewards/length2tails_reward/std": 0.2559855580329895, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08404806908220053, + "epoch": 3.366, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13107116520404816, + "learning_rate": 5.014813825980196e-07, + "loss": -0.0002, + "num_tokens": 14687641.0, + "reward": 13.807857513427734, + "reward_std": 0.5213117003440857, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7804368734359741, + "rewards/length2tails_reward/std": 0.2552006244659424, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08485516533255577, + "epoch": 3.368, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18920296430587769, + "learning_rate": 5.003702087978685e-07, + "loss": 0.0048, + "num_tokens": 14696370.0, + "reward": 13.562747955322266, + "reward_std": 1.0341124534606934, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7378770112991333, + "rewards/length2tails_reward/std": 0.30395567417144775, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.456014394760132, + "rewards/thermo_reward/std": 0.8731143474578857, + "step": 1684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08026226470246911, + "epoch": 3.37, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07742941379547119, + "learning_rate": 4.992598565133709e-07, + "loss": -0.0014, + "num_tokens": 14705099.0, + "reward": 13.728206634521484, + "reward_std": 0.8616410493850708, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.709161102771759, + "rewards/length2tails_reward/std": 0.32603222131729126, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5696256160736084, + "rewards/thermo_reward/std": 0.8564317226409912, + "step": 1685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.21875, + "completions/mean_terminated_length": 273.21875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08489504549652338, + "epoch": 3.372, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09348171204328537, + "learning_rate": 4.981503275702227e-07, + "loss": 0.0057, + "num_tokens": 14713874.0, + "reward": 13.881086349487305, + "reward_std": 0.3735075891017914, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8402734398841858, + "rewards/length2tails_reward/std": 0.2117583304643631, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.6875, + "completions/mean_terminated_length": 273.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08448885660618544, + "epoch": 3.374, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12279488891363144, + "learning_rate": 4.970416237927645e-07, + "loss": 0.0055, + "num_tokens": 14722664.0, + "reward": 13.677044868469238, + "reward_std": 0.9966065883636475, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8499288558959961, + "rewards/length2tails_reward/std": 0.23315902054309845, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5317471027374268, + "rewards/thermo_reward/std": 0.8636287450790405, + "step": 1687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.53125, + "completions/mean_terminated_length": 273.53125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08651688508689404, + "epoch": 3.376, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1983906626701355, + "learning_rate": 4.959337470039815e-07, + "loss": 0.0057, + "num_tokens": 14731449.0, + "reward": 13.744522094726562, + "reward_std": 0.6199595332145691, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8195300102233887, + "rewards/length2tails_reward/std": 0.22915887832641602, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.53125, + "completions/mean_terminated_length": 270.53125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.08888929057866335, + "epoch": 3.378, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12954142689704895, + "learning_rate": 4.948266990254988e-07, + "loss": -0.0002, + "num_tokens": 14740138.0, + "reward": 13.636672973632812, + "reward_std": 1.7633979320526123, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5204615592956543, + "rewards/kidney_reward/std": 0.599727988243103, + "rewards/length2tails_reward/mean": 0.7028523683547974, + "rewards/length2tails_reward/std": 0.2704508304595947, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.58474063873291, + "rewards/thermo_reward/std": 1.1710478067398071, + "step": 1689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 273.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08910947199910879, + "epoch": 3.38, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13247615098953247, + "learning_rate": 4.937204816775787e-07, + "loss": -0.0025, + "num_tokens": 14748918.0, + "reward": 13.64747428894043, + "reward_std": 1.036157488822937, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8059340119361877, + "rewards/length2tails_reward/std": 0.2467590719461441, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.506575345993042, + "rewards/thermo_reward/std": 0.9898343086242676, + "step": 1690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 282.65625, + "completions/mean_terminated_length": 282.65625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0923977354541421, + "epoch": 3.382, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16892749071121216, + "learning_rate": 4.926150967791179e-07, + "loss": -0.0091, + "num_tokens": 14757995.0, + "reward": 13.95375919342041, + "reward_std": 0.22327548265457153, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7692772746086121, + "rewards/length2tails_reward/std": 0.27421385049819946, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08730402961373329, + "epoch": 3.384, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0977780744433403, + "learning_rate": 4.915105461476435e-07, + "loss": 0.0032, + "num_tokens": 14766760.0, + "reward": 13.720968246459961, + "reward_std": 0.5337844491004944, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8345231413841248, + "rewards/length2tails_reward/std": 0.20670267939567566, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498504638671875, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08574420027434826, + "epoch": 3.386, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09038598090410233, + "learning_rate": 4.904068315993117e-07, + "loss": 0.0042, + "num_tokens": 14775498.0, + "reward": 13.764984130859375, + "reward_std": 0.5463270545005798, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7505573630332947, + "rewards/length2tails_reward/std": 0.256422758102417, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08899992611259222, + "epoch": 3.388, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1362181156873703, + "learning_rate": 4.893039549489039e-07, + "loss": 0.0017, + "num_tokens": 14784259.0, + "reward": 13.048135757446289, + "reward_std": 2.5231218338012695, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.452592670917511, + "rewards/kidney_reward/mean": 2.468844175338745, + "rewards/kidney_reward/std": 0.6112906336784363, + "rewards/length2tails_reward/mean": 0.8081793785095215, + "rewards/length2tails_reward/std": 0.22916291654109955, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.152308225631714, + "rewards/thermo_reward/std": 1.6434428691864014, + "step": 1694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07493429351598024, + "epoch": 3.39, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0555286630988121, + "learning_rate": 4.882019180098236e-07, + "loss": -0.0042, + "num_tokens": 14793003.0, + "reward": 13.409624099731445, + "reward_std": 2.6447482109069824, + "rewards/fitness_reward/mean": 7.027220726013184, + "rewards/fitness_reward/std": 1.8891884088516235, + "rewards/kidney_reward/mean": 2.536163330078125, + "rewards/kidney_reward/std": 0.510905921459198, + "rewards/length2tails_reward/mean": 0.7673140168190002, + "rewards/length2tails_reward/std": 0.28843799233436584, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 274.21875, + "completions/mean_terminated_length": 274.21875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0757355852983892, + "epoch": 3.392, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15180395543575287, + "learning_rate": 4.871007225940939e-07, + "loss": 0.006, + "num_tokens": 14801810.0, + "reward": 13.478560447692871, + "reward_std": 1.304787278175354, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8556383848190308, + "rewards/length2tails_reward/std": 0.20991253852844238, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3600502014160156, + "rewards/thermo_reward/std": 1.1117326021194458, + "step": 1696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0866955453529954, + "epoch": 3.394, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11245302855968475, + "learning_rate": 4.860003705123538e-07, + "loss": 0.009, + "num_tokens": 14810551.0, + "reward": 13.753739356994629, + "reward_std": 0.4965442717075348, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.76337730884552, + "rewards/length2tails_reward/std": 0.26430433988571167, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897367000579834, + "rewards/thermo_reward/std": 0.5061467885971069, + "step": 1697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08178166439756751, + "epoch": 3.396, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11046211421489716, + "learning_rate": 4.849008635738553e-07, + "loss": -0.0003, + "num_tokens": 14819302.0, + "reward": 13.504169464111328, + "reward_std": 2.3498494625091553, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.4508585929870605, + "rewards/kidney_reward/std": 0.9934619069099426, + "rewards/length2tails_reward/mean": 0.8170171976089478, + "rewards/length2tails_reward/std": 0.21700961887836456, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5104241371154785, + "rewards/thermo_reward/std": 1.3729063272476196, + "step": 1698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 273.9375, + "completions/mean_terminated_length": 273.9375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09255535062402487, + "epoch": 3.398, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11002900451421738, + "learning_rate": 4.838022035864618e-07, + "loss": -0.0098, + "num_tokens": 14828100.0, + "reward": 13.837339401245117, + "reward_std": 0.4396858513355255, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.801662802696228, + "rewards/length2tails_reward/std": 0.2261502742767334, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.96875, + "completions/mean_terminated_length": 272.96875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.07731029391288757, + "epoch": 3.4, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09554555267095566, + "learning_rate": 4.827043923566434e-07, + "loss": -0.0059, + "num_tokens": 14836867.0, + "reward": 13.464859008789062, + "reward_std": 1.9004313945770264, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.528146982192993, + "rewards/kidney_reward/std": 0.5562539100646973, + "rewards/length2tails_reward/mean": 0.803817868232727, + "rewards/length2tails_reward/std": 0.24521175026893616, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.395145893096924, + "rewards/thermo_reward/std": 1.4224570989608765, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08424821449443698, + "epoch": 3.402, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1601286232471466, + "learning_rate": 4.816074316894749e-07, + "loss": 0.0006, + "num_tokens": 14845621.0, + "reward": 13.775213241577148, + "reward_std": 0.8285742998123169, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7672507166862488, + "rewards/length2tails_reward/std": 0.2551310062408447, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.610823631286621, + "rewards/thermo_reward/std": 0.8301683068275452, + "step": 1701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07866405416280031, + "epoch": 3.404, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11888407170772552, + "learning_rate": 4.805113233886331e-07, + "loss": 0.0013, + "num_tokens": 14854366.0, + "reward": 13.874348640441895, + "reward_std": 0.37398761510849, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7728902101516724, + "rewards/length2tails_reward/std": 0.2928953468799591, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "entropy": 0.08661049697548151, + "epoch": 3.406, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18512208759784698, + "learning_rate": 4.794160692563917e-07, + "loss": -0.0434, + "num_tokens": 14863046.0, + "reward": 13.088939666748047, + "reward_std": 4.899957656860352, + "rewards/fitness_reward/mean": 6.978641986846924, + "rewards/fitness_reward/std": 2.1639907360076904, + "rewards/kidney_reward/mean": 2.4049041271209717, + "rewards/kidney_reward/std": 1.2534205913543701, + "rewards/length2tails_reward/mean": 0.7930488586425781, + "rewards/length2tails_reward/std": 0.23190155625343323, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5260891914367676, + "rewards/thermo_reward/std": 1.4979829788208008, + "step": 1703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.08553109876811504, + "epoch": 3.408, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10315986722707748, + "learning_rate": 4.783216710936212e-07, + "loss": 0.0052, + "num_tokens": 14871793.0, + "reward": 13.883841514587402, + "reward_std": 0.374860554933548, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8678221106529236, + "rewards/length2tails_reward/std": 0.1376638561487198, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.07992964796721935, + "epoch": 3.41, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10810313373804092, + "learning_rate": 4.772281306997848e-07, + "loss": 0.0039, + "num_tokens": 14880520.0, + "reward": 13.925355911254883, + "reward_std": 0.312097430229187, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8841127157211304, + "rewards/length2tails_reward/std": 0.12169551104307175, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 286.59375, + "completions/mean_terminated_length": 271.51611328125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09155255556106567, + "epoch": 3.412, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24644652009010315, + "learning_rate": 4.761354498729344e-07, + "loss": -0.0228, + "num_tokens": 14889723.0, + "reward": 13.73485279083252, + "reward_std": 0.8732155561447144, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7760403156280518, + "rewards/length2tails_reward/std": 0.25661709904670715, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5695838928222656, + "rewards/thermo_reward/std": 0.85664302110672, + "step": 1706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.59375, + "completions/mean_terminated_length": 269.59375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.08220897056162357, + "epoch": 3.414, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1083146333694458, + "learning_rate": 4.7504363040970987e-07, + "loss": -0.0039, + "num_tokens": 14898382.0, + "reward": 13.708334922790527, + "reward_std": 0.8958057165145874, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.770789384841919, + "rewards/length2tails_reward/std": 0.2582211494445801, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.570950508117676, + "rewards/thermo_reward/std": 0.8497097492218018, + "step": 1707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08558735437691212, + "epoch": 3.416, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08821485936641693, + "learning_rate": 4.7395267410533304e-07, + "loss": 0.0004, + "num_tokens": 14907104.0, + "reward": 13.673770904541016, + "reward_std": 0.954501211643219, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7165445685386658, + "rewards/length2tails_reward/std": 0.25880908966064453, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.514451503753662, + "rewards/thermo_reward/std": 0.949852705001831, + "step": 1708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07648206362500787, + "epoch": 3.418, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13279765844345093, + "learning_rate": 4.728625827536079e-07, + "loss": 0.0041, + "num_tokens": 14915841.0, + "reward": 13.487560272216797, + "reward_std": 2.0662591457366943, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.486978769302368, + "rewards/kidney_reward/std": 0.6481077075004578, + "rewards/length2tails_reward/mean": 0.7673639059066772, + "rewards/length2tails_reward/std": 0.23985138535499573, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5201685428619385, + "rewards/thermo_reward/std": 1.2155030965805054, + "step": 1709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 280.78125, + "completions/mean_terminated_length": 280.78125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08176046423614025, + "epoch": 3.42, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13497784733772278, + "learning_rate": 4.7177335814691564e-07, + "loss": -0.0096, + "num_tokens": 14924858.0, + "reward": 13.632104873657227, + "reward_std": 1.6099051237106323, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5389091968536377, + "rewards/kidney_reward/std": 0.4953727722167969, + "rewards/length2tails_reward/mean": 0.7525918483734131, + "rewards/length2tails_reward/std": 0.2809031903743744, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5567517280578613, + "rewards/thermo_reward/std": 1.1193660497665405, + "step": 1710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.25, + "completions/mean_terminated_length": 273.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08430183865129948, + "epoch": 3.422, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10553085058927536, + "learning_rate": 4.7068500207621255e-07, + "loss": 0.0019, + "num_tokens": 14933634.0, + "reward": 13.798028945922852, + "reward_std": 0.46553653478622437, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8074179887771606, + "rewards/length2tails_reward/std": 0.2968822121620178, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08506747148931026, + "epoch": 3.424, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.118733249604702, + "learning_rate": 4.6959751633102673e-07, + "loss": 0.0003, + "num_tokens": 14942395.0, + "reward": 13.716915130615234, + "reward_std": 0.5371593832969666, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.79399573802948, + "rewards/length2tails_reward/std": 0.2721828520298004, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498507022857666, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.3125, + "completions/mean_terminated_length": 273.3125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08996152225881815, + "epoch": 3.426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10672859847545624, + "learning_rate": 4.685109026994556e-07, + "loss": 0.0026, + "num_tokens": 14951173.0, + "reward": 13.839287757873535, + "reward_std": 0.43059033155441284, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8211460113525391, + "rewards/length2tails_reward/std": 0.24959631264209747, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0840189615264535, + "epoch": 3.428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09118340164422989, + "learning_rate": 4.674251629681615e-07, + "loss": 0.0038, + "num_tokens": 14959923.0, + "reward": 13.689261436462402, + "reward_std": 0.6019656658172607, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.791059136390686, + "rewards/length2tails_reward/std": 0.24709247052669525, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498507022857666, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 273.65625, + "completions/mean_terminated_length": 273.65625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08755708485841751, + "epoch": 3.43, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11407990753650665, + "learning_rate": 4.663402989223709e-07, + "loss": -0.0055, + "num_tokens": 14968712.0, + "reward": 13.876548767089844, + "reward_std": 0.38363367319107056, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7948861718177795, + "rewards/length2tails_reward/std": 0.21179290115833282, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.08625697903335094, + "epoch": 3.432, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07827986031770706, + "learning_rate": 4.652563123458703e-07, + "loss": -0.0005, + "num_tokens": 14977424.0, + "reward": 13.826070785522461, + "reward_std": 0.944733202457428, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7625695466995239, + "rewards/length2tails_reward/std": 0.26302239298820496, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6895079612731934, + "rewards/thermo_reward/std": 0.7893825769424438, + "step": 1716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08533341623842716, + "epoch": 3.434, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16518720984458923, + "learning_rate": 4.641732050210031e-07, + "loss": 0.0011, + "num_tokens": 14986168.0, + "reward": 12.971258163452148, + "reward_std": 4.007254600524902, + "rewards/fitness_reward/mean": 7.052707672119141, + "rewards/fitness_reward/std": 1.745011329650879, + "rewards/kidney_reward/mean": 2.399848461151123, + "rewards/kidney_reward/std": 0.9084376096725464, + "rewards/length2tails_reward/mean": 0.7630383968353271, + "rewards/length2tails_reward/std": 0.2822224199771881, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.342397928237915, + "rewards/thermo_reward/std": 1.6145765781402588, + "step": 1717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08906451426446438, + "epoch": 3.436, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11727608740329742, + "learning_rate": 4.6309097872866766e-07, + "loss": -0.0033, + "num_tokens": 14994897.0, + "reward": 13.910297393798828, + "reward_std": 0.31524109840393066, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7335153222084045, + "rewards/length2tails_reward/std": 0.2708199620246887, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.34375, + "completions/mean_terminated_length": 273.34375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.0859810272231698, + "epoch": 3.438, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08078237622976303, + "learning_rate": 4.6200963524831284e-07, + "loss": 0.0031, + "num_tokens": 15003676.0, + "reward": 13.933954238891602, + "reward_std": 0.3779996335506439, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.844823956489563, + "rewards/length2tails_reward/std": 0.1683105230331421, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.65625, + "completions/mean_terminated_length": 271.65625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.0825602188706398, + "epoch": 3.44, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0974683165550232, + "learning_rate": 4.609291763579357e-07, + "loss": -0.0025, + "num_tokens": 15012401.0, + "reward": 13.808998107910156, + "reward_std": 0.5261107683181763, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7918394207954407, + "rewards/length2tails_reward/std": 0.2661225199699402, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08152022119611502, + "epoch": 3.442, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06283272057771683, + "learning_rate": 4.5984960383408e-07, + "loss": -0.0053, + "num_tokens": 15021161.0, + "reward": 13.772812843322754, + "reward_std": 1.2733838558197021, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.78622967004776, + "rewards/length2tails_reward/std": 0.2837279736995697, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6065256595611572, + "rewards/thermo_reward/std": 1.2588016986846924, + "step": 1721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.40625, + "completions/mean_terminated_length": 271.40625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08558083605021238, + "epoch": 3.444, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11926688253879547, + "learning_rate": 4.5877091945183143e-07, + "loss": -0.0023, + "num_tokens": 15029878.0, + "reward": 13.610316276550293, + "reward_std": 1.2155641317367554, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7381070852279663, + "rewards/length2tails_reward/std": 0.24422591924667358, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.448840856552124, + "rewards/thermo_reward/std": 1.20847487449646, + "step": 1722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08817206136882305, + "epoch": 3.446, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07041650265455246, + "learning_rate": 4.576931249848155e-07, + "loss": -0.0032, + "num_tokens": 15038619.0, + "reward": 13.744803428649902, + "reward_std": 0.8200621008872986, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.536046028137207, + "rewards/kidney_reward/std": 0.5115687847137451, + "rewards/length2tails_reward/mean": 0.7806336879730225, + "rewards/length2tails_reward/std": 0.2577957212924957, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "entropy": 0.08811759203672409, + "epoch": 3.448, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3796294927597046, + "learning_rate": 4.5661622220519455e-07, + "loss": -0.0211, + "num_tokens": 15047347.0, + "reward": 13.921072006225586, + "reward_std": 0.3113763928413391, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8412730693817139, + "rewards/length2tails_reward/std": 0.22260740399360657, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08379036001861095, + "epoch": 3.45, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07183808833360672, + "learning_rate": 4.555402128836642e-07, + "loss": -0.0059, + "num_tokens": 15056117.0, + "reward": 13.833101272583008, + "reward_std": 0.7487102746963501, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8102487325668335, + "rewards/length2tails_reward/std": 0.24392889440059662, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.08676104340702295, + "epoch": 3.452, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07923120260238647, + "learning_rate": 4.544650987894514e-07, + "loss": -0.0054, + "num_tokens": 15064831.0, + "reward": 13.915239334106445, + "reward_std": 0.32995709776878357, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.782934844493866, + "rewards/length2tails_reward/std": 0.2695567309856415, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 274.0625, + "completions/mean_terminated_length": 274.0625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10370921809226274, + "epoch": 3.454, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0856776013970375, + "learning_rate": 4.533908816903115e-07, + "loss": -0.0002, + "num_tokens": 15073633.0, + "reward": 13.965975761413574, + "reward_std": 0.22475717961788177, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8914409279823303, + "rewards/length2tails_reward/std": 0.1631007343530655, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08512784168124199, + "epoch": 3.456, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07340458035469055, + "learning_rate": 4.5231756335252433e-07, + "loss": -0.0011, + "num_tokens": 15082372.0, + "reward": 13.631868362426758, + "reward_std": 1.0840603113174438, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7518198490142822, + "rewards/length2tails_reward/std": 0.27484434843063354, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4963815212249756, + "rewards/thermo_reward/std": 1.0421310663223267, + "step": 1728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.71875, + "completions/mean_terminated_length": 272.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07721906714141369, + "epoch": 3.458, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20975400507450104, + "learning_rate": 4.512451455408929e-07, + "loss": -0.0056, + "num_tokens": 15091131.0, + "reward": 12.620357513427734, + "reward_std": 3.4667251110076904, + "rewards/fitness_reward/mean": 6.938035011291504, + "rewards/fitness_reward/std": 1.840762972831726, + "rewards/kidney_reward/mean": 2.36350679397583, + "rewards/kidney_reward/std": 0.737473726272583, + "rewards/length2tails_reward/mean": 0.7722162008285522, + "rewards/length2tails_reward/std": 0.3076375722885132, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.141594171524048, + "rewards/thermo_reward/std": 1.4435781240463257, + "step": 1729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0899735875427723, + "epoch": 3.46, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16601188480854034, + "learning_rate": 4.5017363001873774e-07, + "loss": 0.0053, + "num_tokens": 15099868.0, + "reward": 13.60742473602295, + "reward_std": 0.6418152451515198, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7704048156738281, + "rewards/length2tails_reward/std": 0.24701431393623352, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.470078945159912, + "rewards/thermo_reward/std": 0.5830413699150085, + "step": 1730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 275.71875, + "completions/mean_terminated_length": 275.71875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.09080104809254408, + "epoch": 3.462, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16605244576931, + "learning_rate": 4.4910301854789755e-07, + "loss": -0.0038, + "num_tokens": 15108723.0, + "reward": 13.880746841430664, + "reward_std": 0.3769935965538025, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8368751406669617, + "rewards/length2tails_reward/std": 0.21261143684387207, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07802405953407288, + "epoch": 3.464, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06647925823926926, + "learning_rate": 4.480333128887237e-07, + "loss": -0.0054, + "num_tokens": 15117489.0, + "reward": 13.134675979614258, + "reward_std": 2.7280941009521484, + "rewards/fitness_reward/mean": 7.047297477722168, + "rewards/fitness_reward/std": 1.7756171226501465, + "rewards/kidney_reward/mean": 2.478243827819824, + "rewards/kidney_reward/std": 0.5618298053741455, + "rewards/length2tails_reward/mean": 0.7867090702056885, + "rewards/length2tails_reward/std": 0.27397066354751587, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.430464267730713, + "rewards/thermo_reward/std": 0.9946144819259644, + "step": 1732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.0766105898655951, + "epoch": 3.466, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07114367187023163, + "learning_rate": 4.4696451480007846e-07, + "loss": 0.0035, + "num_tokens": 15126232.0, + "reward": 13.954242706298828, + "reward_std": 0.22321708500385284, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7741064429283142, + "rewards/length2tails_reward/std": 0.2730728089809418, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.09087709616869688, + "epoch": 3.468, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11216463148593903, + "learning_rate": 4.458966260393322e-07, + "loss": -0.0007, + "num_tokens": 15134930.0, + "reward": 13.91307544708252, + "reward_std": 0.3138239085674286, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7613010406494141, + "rewards/length2tails_reward/std": 0.2676401138305664, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.08705608453601599, + "epoch": 3.4699999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1364695131778717, + "learning_rate": 4.448296483623587e-07, + "loss": 0.0003, + "num_tokens": 15143691.0, + "reward": 13.93537425994873, + "reward_std": 0.37957563996315, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8590216636657715, + "rewards/length2tails_reward/std": 0.20525671541690826, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.07921055890619755, + "epoch": 3.472, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17655280232429504, + "learning_rate": 4.4376358352353526e-07, + "loss": -0.0037, + "num_tokens": 15152413.0, + "reward": 13.572860717773438, + "reward_std": 1.2297868728637695, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7480850219726562, + "rewards/length2tails_reward/std": 0.2410997599363327, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.495255708694458, + "rewards/thermo_reward/std": 1.0479415655136108, + "step": 1736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.40625, + "completions/mean_terminated_length": 273.40625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08936100918799639, + "epoch": 3.474, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09362028539180756, + "learning_rate": 4.4269843327573743e-07, + "loss": -0.0006, + "num_tokens": 15161194.0, + "reward": 12.85144329071045, + "reward_std": 4.335888385772705, + "rewards/fitness_reward/mean": 7.02105712890625, + "rewards/fitness_reward/std": 1.924055814743042, + "rewards/kidney_reward/mean": 2.4442200660705566, + "rewards/kidney_reward/std": 0.8848810791969299, + "rewards/length2tails_reward/mean": 0.8391945362091064, + "rewards/length2tails_reward/std": 0.20115543901920319, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2022478580474854, + "rewards/thermo_reward/std": 1.7934486865997314, + "step": 1737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 283.625, + "completions/mean_terminated_length": 283.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08751403260976076, + "epoch": 3.476, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29367560148239136, + "learning_rate": 4.416341993703373e-07, + "loss": -0.0084, + "num_tokens": 15170302.0, + "reward": 13.098956108093262, + "reward_std": 4.083829879760742, + "rewards/fitness_reward/mean": 7.032780170440674, + "rewards/fitness_reward/std": 1.8577386140823364, + "rewards/kidney_reward/mean": 2.4100849628448486, + "rewards/kidney_reward/std": 0.9298133254051208, + "rewards/length2tails_reward/mean": 0.7794876098632812, + "rewards/length2tails_reward/std": 0.2651849091053009, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4781417846679688, + "rewards/thermo_reward/std": 1.340535044670105, + "step": 1738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09130069054663181, + "epoch": 3.4779999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14665277302265167, + "learning_rate": 4.405708835572005e-07, + "loss": 0.001, + "num_tokens": 15179025.0, + "reward": 13.806692123413086, + "reward_std": 0.5210264325141907, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.768774151802063, + "rewards/length2tails_reward/std": 0.26261356472969055, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.53125, + "completions/mean_terminated_length": 272.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08416412118822336, + "epoch": 3.48, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1830289661884308, + "learning_rate": 4.39508487584682e-07, + "loss": 0.0001, + "num_tokens": 15187778.0, + "reward": 13.80591869354248, + "reward_std": 0.5232962369918823, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7610487937927246, + "rewards/length2tails_reward/std": 0.2874740958213806, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.08090137969702482, + "epoch": 3.482, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10863588005304337, + "learning_rate": 4.384470131996252e-07, + "loss": 0.0004, + "num_tokens": 15196496.0, + "reward": 13.718465805053711, + "reward_std": 0.5391813516616821, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8095073699951172, + "rewards/length2tails_reward/std": 0.2026718258857727, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498507022857666, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.53125, + "completions/mean_terminated_length": 270.53125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.085418657399714, + "epoch": 3.484, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1338280588388443, + "learning_rate": 4.3738646214735864e-07, + "loss": -0.0165, + "num_tokens": 15205185.0, + "reward": 13.87672233581543, + "reward_std": 0.3799758553504944, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7966322898864746, + "rewards/length2tails_reward/std": 0.20604509115219116, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08169888611882925, + "epoch": 3.4859999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07724236696958542, + "learning_rate": 4.363268361716912e-07, + "loss": -0.0017, + "num_tokens": 15213948.0, + "reward": 13.66454792022705, + "reward_std": 0.8659971356391907, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8111091256141663, + "rewards/length2tails_reward/std": 0.21726176142692566, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.495771646499634, + "rewards/thermo_reward/std": 0.8601759076118469, + "step": 1743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08383139874786139, + "epoch": 3.488, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11576797813177109, + "learning_rate": 4.3526813701491183e-07, + "loss": -0.0033, + "num_tokens": 15222702.0, + "reward": 13.91389274597168, + "reward_std": 0.32040977478027344, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7694723606109619, + "rewards/length2tails_reward/std": 0.28133609890937805, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 274.15625, + "completions/mean_terminated_length": 274.15625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0834615072235465, + "epoch": 3.49, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18630819022655487, + "learning_rate": 4.3421036641778553e-07, + "loss": 0.0013, + "num_tokens": 15231507.0, + "reward": 13.205314636230469, + "reward_std": 2.668398380279541, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.3939995765686035, + "rewards/kidney_reward/std": 0.9162217378616333, + "rewards/length2tails_reward/mean": 0.8655220866203308, + "rewards/length2tails_reward/std": 0.20133092999458313, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.321087598800659, + "rewards/thermo_reward/std": 1.5478066205978394, + "step": 1745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08705971017479897, + "epoch": 3.492, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08833774924278259, + "learning_rate": 4.3315352611955035e-07, + "loss": 0.0012, + "num_tokens": 15240231.0, + "reward": 13.732002258300781, + "reward_std": 1.0502101182937622, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7496770024299622, + "rewards/length2tails_reward/std": 0.27922266721725464, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.596729278564453, + "rewards/thermo_reward/std": 0.9044937491416931, + "step": 1746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08123445883393288, + "epoch": 3.4939999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12158084660768509, + "learning_rate": 4.320976178579141e-07, + "loss": 0.0005, + "num_tokens": 15248982.0, + "reward": 13.316404342651367, + "reward_std": 1.9671152830123901, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5067176818847656, + "rewards/kidney_reward/std": 0.5403326153755188, + "rewards/length2tails_reward/mean": 0.7520942687988281, + "rewards/length2tails_reward/std": 0.2965957820415497, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.273292303085327, + "rewards/thermo_reward/std": 1.465672254562378, + "step": 1747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08086736872792244, + "epoch": 3.496, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09447924047708511, + "learning_rate": 4.310426433690528e-07, + "loss": 0.0068, + "num_tokens": 15257752.0, + "reward": 13.811731338500977, + "reward_std": 0.5163472890853882, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8191708326339722, + "rewards/length2tails_reward/std": 0.21201351284980774, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.34375, + "completions/mean_terminated_length": 273.34375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.07997060008347034, + "epoch": 3.498, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09858963638544083, + "learning_rate": 4.299886043876071e-07, + "loss": -0.0017, + "num_tokens": 15266531.0, + "reward": 13.764122009277344, + "reward_std": 0.50864577293396, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8671994209289551, + "rewards/length2tails_reward/std": 0.12816838920116425, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.59375, + "completions/mean_terminated_length": 269.59375, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.08890233561396599, + "epoch": 3.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16905415058135986, + "learning_rate": 4.289355026466791e-07, + "loss": 0.0071, + "num_tokens": 15275190.0, + "reward": 13.854276657104492, + "reward_std": 0.4811854362487793, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8457736968994141, + "rewards/length2tails_reward/std": 0.1967909038066864, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0906071225181222, + "epoch": 3.502, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09274038672447205, + "learning_rate": 4.278833398778305e-07, + "loss": -0.006, + "num_tokens": 15283938.0, + "reward": 13.834604263305664, + "reward_std": 0.4413672685623169, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7743013501167297, + "rewards/length2tails_reward/std": 0.2758329510688782, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 282.21875, + "completions/mean_terminated_length": 282.21875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08395624533295631, + "epoch": 3.504, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4823765456676483, + "learning_rate": 4.2683211781107785e-07, + "loss": -0.0172, + "num_tokens": 15293001.0, + "reward": 13.870827674865723, + "reward_std": 0.3858233094215393, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7376867532730103, + "rewards/length2tails_reward/std": 0.2750190496444702, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.03125, + "completions/mean_terminated_length": 271.03125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.08187512308359146, + "epoch": 3.5060000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11266060173511505, + "learning_rate": 4.257818381748921e-07, + "loss": -0.0006, + "num_tokens": 15301706.0, + "reward": 13.875701904296875, + "reward_std": 0.38500985503196716, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7864289283752441, + "rewards/length2tails_reward/std": 0.23578976094722748, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.08811250422149897, + "epoch": 3.508, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08882928639650345, + "learning_rate": 4.247325026961941e-07, + "loss": 0.0026, + "num_tokens": 15310396.0, + "reward": 13.953454971313477, + "reward_std": 0.22397710382938385, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7662337422370911, + "rewards/length2tails_reward/std": 0.25318098068237305, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.08091798983514309, + "epoch": 3.51, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07322859764099121, + "learning_rate": 4.2368411310035237e-07, + "loss": -0.004, + "num_tokens": 15319112.0, + "reward": 13.878302574157715, + "reward_std": 0.3881266117095947, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8124330043792725, + "rewards/length2tails_reward/std": 0.23422132432460785, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 270.0625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.08312960062175989, + "epoch": 3.512, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20515073835849762, + "learning_rate": 4.226366711111807e-07, + "loss": -0.0256, + "num_tokens": 15327786.0, + "reward": 13.440038681030273, + "reward_std": 2.7264461517333984, + "rewards/fitness_reward/mean": 7.043000221252441, + "rewards/fitness_reward/std": 1.7999252080917358, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.849888026714325, + "rewards/length2tails_reward/std": 0.1674598753452301, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.612929105758667, + "rewards/thermo_reward/std": 0.8191527128219604, + "step": 1756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.21875, + "completions/mean_terminated_length": 273.21875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08522853534668684, + "epoch": 3.5140000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12777894735336304, + "learning_rate": 4.2159017845093346e-07, + "loss": 0.0014, + "num_tokens": 15336561.0, + "reward": 13.880290031433105, + "reward_std": 0.3749031126499176, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8323098421096802, + "rewards/length2tails_reward/std": 0.2247202843427658, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08242740109562874, + "epoch": 3.516, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10702885687351227, + "learning_rate": 4.205446368403052e-07, + "loss": -0.0004, + "num_tokens": 15345308.0, + "reward": 13.673827171325684, + "reward_std": 0.9926640391349792, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8025897741317749, + "rewards/length2tails_reward/std": 0.2218964546918869, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5332627296447754, + "rewards/thermo_reward/std": 0.8561917543411255, + "step": 1758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.10247748345136642, + "epoch": 3.518, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5401629209518433, + "learning_rate": 4.195000479984264e-07, + "loss": 0.0074, + "num_tokens": 15354009.0, + "reward": 13.805831909179688, + "reward_std": 0.5156649351119995, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7601784467697144, + "rewards/length2tails_reward/std": 0.23915334045886993, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 282.84375, + "completions/mean_terminated_length": 282.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08352225320413709, + "epoch": 3.52, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10053406655788422, + "learning_rate": 4.184564136428611e-07, + "loss": -0.0128, + "num_tokens": 15363092.0, + "reward": 13.87208366394043, + "reward_std": 0.375398188829422, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.750244677066803, + "rewards/length2tails_reward/std": 0.27107128500938416, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5, + "completions/mean_terminated_length": 272.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09472567960619926, + "epoch": 3.5220000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13488849997520447, + "learning_rate": 4.174137354896039e-07, + "loss": -0.0018, + "num_tokens": 15371844.0, + "reward": 13.780675888061523, + "reward_std": 0.5977986454963684, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7822105884552002, + "rewards/length2tails_reward/std": 0.24083751440048218, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 274.125, + "completions/mean_terminated_length": 274.125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.07826209580525756, + "epoch": 3.524, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06917013227939606, + "learning_rate": 4.163720152530765e-07, + "loss": 0.0035, + "num_tokens": 15380648.0, + "reward": 13.924861907958984, + "reward_std": 0.3114674389362335, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8791612386703491, + "rewards/length2tails_reward/std": 0.19188782572746277, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.34375, + "completions/mean_terminated_length": 273.34375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09946441370993853, + "epoch": 3.526, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13247686624526978, + "learning_rate": 4.153312546461264e-07, + "loss": 0.0046, + "num_tokens": 15389427.0, + "reward": 13.841333389282227, + "reward_std": 0.4241860806941986, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8416051864624023, + "rewards/length2tails_reward/std": 0.1799822300672531, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.28125, + "completions/mean_terminated_length": 273.28125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08188586542382836, + "epoch": 3.528, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09976492822170258, + "learning_rate": 4.142914553800232e-07, + "loss": -0.0034, + "num_tokens": 15398204.0, + "reward": 13.013737678527832, + "reward_std": 4.373779296875, + "rewards/fitness_reward/mean": 7.029188632965088, + "rewards/fitness_reward/std": 1.8780547380447388, + "rewards/kidney_reward/mean": 2.406290054321289, + "rewards/kidney_reward/std": 0.9507110118865967, + "rewards/length2tails_reward/mean": 0.8287488222122192, + "rewards/length2tails_reward/std": 0.2133997231721878, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.395383596420288, + "rewards/thermo_reward/std": 1.5805878639221191, + "step": 1764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08668717928230762, + "epoch": 3.5300000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07370003312826157, + "learning_rate": 4.132526191644549e-07, + "loss": -0.0018, + "num_tokens": 15406946.0, + "reward": 13.925870895385742, + "reward_std": 0.3816991448402405, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7639847993850708, + "rewards/length2tails_reward/std": 0.2842487096786499, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 285.375, + "completions/mean_terminated_length": 285.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09249016921967268, + "epoch": 3.532, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2883523404598236, + "learning_rate": 4.1221474770752696e-07, + "loss": -0.0193, + "num_tokens": 15416110.0, + "reward": 13.796365737915039, + "reward_std": 0.46921226382255554, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.790784478187561, + "rewards/length2tails_reward/std": 0.24667830765247345, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08595772087574005, + "epoch": 3.534, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07527884840965271, + "learning_rate": 4.111778427157585e-07, + "loss": -0.0068, + "num_tokens": 15424857.0, + "reward": 13.066018104553223, + "reward_std": 3.571143865585327, + "rewards/fitness_reward/mean": 6.99554443359375, + "rewards/fitness_reward/std": 1.7628074884414673, + "rewards/kidney_reward/mean": 2.4402666091918945, + "rewards/kidney_reward/std": 0.7342117428779602, + "rewards/length2tails_reward/mean": 0.7519161105155945, + "rewards/length2tails_reward/std": 0.31728798151016235, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4550156593322754, + "rewards/thermo_reward/std": 1.3517484664916992, + "step": 1767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.09108984749764204, + "epoch": 3.536, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2417386919260025, + "learning_rate": 4.101419058940786e-07, + "loss": -0.0202, + "num_tokens": 15433559.0, + "reward": 13.393560409545898, + "reward_std": 1.687468409538269, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.7939262986183167, + "rewards/length2tails_reward/std": 0.22488641738891602, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3085813522338867, + "rewards/thermo_reward/std": 1.4648016691207886, + "step": 1768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.5625, + "completions/mean_terminated_length": 273.5625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08521412871778011, + "epoch": 3.5380000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10136404633522034, + "learning_rate": 4.0910693894582547e-07, + "loss": -0.0022, + "num_tokens": 15442345.0, + "reward": 13.412859916687012, + "reward_std": 1.929201364517212, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5046486854553223, + "rewards/kidney_reward/std": 0.5515561699867249, + "rewards/length2tails_reward/mean": 0.8514991998672485, + "rewards/length2tails_reward/std": 0.2045324146747589, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3618760108947754, + "rewards/thermo_reward/std": 1.4068026542663574, + "step": 1769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08393862191587687, + "epoch": 3.54, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11011866480112076, + "learning_rate": 4.0807294357274214e-07, + "loss": 0.0059, + "num_tokens": 15451063.0, + "reward": 13.429059982299805, + "reward_std": 2.498326301574707, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.493276596069336, + "rewards/kidney_reward/std": 0.7535098791122437, + "rewards/length2tails_reward/mean": 0.7319997549057007, + "rewards/length2tails_reward/std": 0.26997649669647217, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.516416549682617, + "rewards/thermo_reward/std": 1.1344144344329834, + "step": 1770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08664077240973711, + "epoch": 3.542, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11247962713241577, + "learning_rate": 4.0703992147497426e-07, + "loss": -0.0015, + "num_tokens": 15459817.0, + "reward": 13.916162490844727, + "reward_std": 0.31563135981559753, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7921739816665649, + "rewards/length2tails_reward/std": 0.2422455996274948, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.08142776181921363, + "epoch": 3.544, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10102333128452301, + "learning_rate": 4.060078743510671e-07, + "loss": -0.0003, + "num_tokens": 15468552.0, + "reward": 13.715620040893555, + "reward_std": 0.5393779873847961, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7810434699058533, + "rewards/length2tails_reward/std": 0.19498248398303986, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498504638671875, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.03125, + "completions/mean_terminated_length": 271.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08150176517665386, + "epoch": 3.5460000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08746400475502014, + "learning_rate": 4.049768038979631e-07, + "loss": -0.0048, + "num_tokens": 15477257.0, + "reward": 13.692277908325195, + "reward_std": 1.6817798614501953, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5264596939086914, + "rewards/kidney_reward/std": 0.5657978057861328, + "rewards/length2tails_reward/mean": 0.7054793238639832, + "rewards/length2tails_reward/std": 0.2941136062145233, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.634085178375244, + "rewards/thermo_reward/std": 1.1029006242752075, + "step": 1773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09111022902652621, + "epoch": 3.548, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12175245583057404, + "learning_rate": 4.0394671181099783e-07, + "loss": -0.0007, + "num_tokens": 15486023.0, + "reward": 13.690387725830078, + "reward_std": 0.6123297214508057, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8023138046264648, + "rewards/length2tails_reward/std": 0.26038527488708496, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498507022857666, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08034937083721161, + "epoch": 3.55, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09713394939899445, + "learning_rate": 4.029175997838995e-07, + "loss": -0.0137, + "num_tokens": 15494774.0, + "reward": 13.99374771118164, + "reward_std": 0.02433871291577816, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7703039050102234, + "rewards/length2tails_reward/std": 0.24338841438293457, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.829052448272705, + "rewards/thermo_reward/std": 0.0, + "step": 1775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.09106822917237878, + "epoch": 3.552, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08475534617900848, + "learning_rate": 4.01889469508784e-07, + "loss": -0.0007, + "num_tokens": 15503503.0, + "reward": 13.563141822814941, + "reward_std": 1.5742127895355225, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.526200532913208, + "rewards/kidney_reward/std": 0.567264199256897, + "rewards/length2tails_reward/mean": 0.7878760099411011, + "rewards/length2tails_reward/std": 0.2587636709213257, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.496969223022461, + "rewards/thermo_reward/std": 1.0391004085540771, + "step": 1776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.08386838296428323, + "epoch": 3.5540000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10282585024833679, + "learning_rate": 4.008623226761534e-07, + "loss": 0.0002, + "num_tokens": 15512234.0, + "reward": 13.76917839050293, + "reward_std": 0.5571933388710022, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7925007343292236, + "rewards/length2tails_reward/std": 0.2730444669723511, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.0853539565578103, + "epoch": 3.556, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10087975859642029, + "learning_rate": 3.998361609748928e-07, + "loss": -0.0026, + "num_tokens": 15520936.0, + "reward": 13.262292861938477, + "reward_std": 2.375058889389038, + "rewards/fitness_reward/mean": 7.052567481994629, + "rewards/fitness_reward/std": 1.7458053827285767, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7606244683265686, + "rewards/length2tails_reward/std": 0.27506399154663086, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4619014263153076, + "rewards/thermo_reward/std": 1.0260679721832275, + "step": 1778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.08862397447228432, + "epoch": 3.558, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10122814774513245, + "learning_rate": 3.988109860922666e-07, + "loss": 0.0033, + "num_tokens": 15529700.0, + "reward": 13.856950759887695, + "reward_std": 0.4788967967033386, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8725180625915527, + "rewards/length2tails_reward/std": 0.21750660240650177, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 274.59375, + "completions/mean_terminated_length": 274.59375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.08838807186111808, + "epoch": 3.56, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3841480612754822, + "learning_rate": 3.977867997139178e-07, + "loss": -0.0002, + "num_tokens": 15538519.0, + "reward": 13.84387493133545, + "reward_std": 0.4256982207298279, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8670163154602051, + "rewards/length2tails_reward/std": 0.1517139971256256, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.08908396679908037, + "epoch": 3.5620000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1281057596206665, + "learning_rate": 3.967636035238635e-07, + "loss": 0.0055, + "num_tokens": 15547255.0, + "reward": 12.93602180480957, + "reward_std": 5.5553297996521, + "rewards/fitness_reward/mean": 6.948566436767578, + "rewards/fitness_reward/std": 2.3341219425201416, + "rewards/kidney_reward/mean": 2.361267328262329, + "rewards/kidney_reward/std": 1.500267744064331, + "rewards/length2tails_reward/mean": 0.8272075653076172, + "rewards/length2tails_reward/std": 0.20161886513233185, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.443466901779175, + "rewards/thermo_reward/std": 1.7438091039657593, + "step": 1781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.08590043894946575, + "epoch": 3.564, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12525677680969238, + "learning_rate": 3.9574139920449267e-07, + "loss": 0.0048, + "num_tokens": 15555964.0, + "reward": 13.8359375, + "reward_std": 0.4279092848300934, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7876396179199219, + "rewards/length2tails_reward/std": 0.22690580785274506, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 274.34375, + "completions/mean_terminated_length": 274.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08884886186569929, + "epoch": 3.566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09117615222930908, + "learning_rate": 3.947201884365639e-07, + "loss": -0.0038, + "num_tokens": 15564775.0, + "reward": 13.65673828125, + "reward_std": 1.285060167312622, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7954223155975342, + "rewards/length2tails_reward/std": 0.26181408762931824, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.516890525817871, + "rewards/thermo_reward/std": 1.1318851709365845, + "step": 1783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.28125, + "completions/mean_terminated_length": 270.28125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.08355098683387041, + "epoch": 3.568, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09611167013645172, + "learning_rate": 3.9369997289920085e-07, + "loss": -0.0048, + "num_tokens": 15573456.0, + "reward": 13.460000991821289, + "reward_std": 1.3074196577072144, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.814959704875946, + "rewards/length2tails_reward/std": 0.24196474254131317, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3181991577148438, + "rewards/thermo_reward/std": 1.271405816078186, + "step": 1784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09087568428367376, + "epoch": 3.57, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12429216504096985, + "learning_rate": 3.926807542698922e-07, + "loss": 0.003, + "num_tokens": 15582211.0, + "reward": 13.4776029586792, + "reward_std": 0.94354647397995, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8276777863502502, + "rewards/length2tails_reward/std": 0.20748263597488403, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.334529399871826, + "rewards/thermo_reward/std": 0.9096359610557556, + "step": 1785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0920605594292283, + "epoch": 3.572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10412466526031494, + "learning_rate": 3.916625342244868e-07, + "loss": 0.0016, + "num_tokens": 15590939.0, + "reward": 13.483139038085938, + "reward_std": 1.349585771560669, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7391869425773621, + "rewards/length2tails_reward/std": 0.28878656029701233, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4064245223999023, + "rewards/thermo_reward/std": 0.9289711713790894, + "step": 1786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 287.40625, + "completions/mean_terminated_length": 272.3548278808594, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09477597661316395, + "epoch": 3.574, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0864534378051758, + "learning_rate": 3.9064531443719194e-07, + "loss": -0.0165, + "num_tokens": 15600168.0, + "reward": 13.918493270874023, + "reward_std": 0.31318578124046326, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8154876232147217, + "rewards/length2tails_reward/std": 0.2113732397556305, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.3125, + "completions/mean_terminated_length": 273.3125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08148432523012161, + "epoch": 3.576, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0958944708108902, + "learning_rate": 3.8962909658056944e-07, + "loss": -0.0071, + "num_tokens": 15608946.0, + "reward": 13.712947845458984, + "reward_std": 0.8171798586845398, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8052818775177002, + "rewards/length2tails_reward/std": 0.2560892701148987, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.71875, + "completions/mean_terminated_length": 273.71875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08467878680676222, + "epoch": 3.578, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06395456194877625, + "learning_rate": 3.886138823255348e-07, + "loss": -0.0045, + "num_tokens": 15617737.0, + "reward": 13.70904541015625, + "reward_std": 1.2468868494033813, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8645087480545044, + "rewards/length2tails_reward/std": 0.17950421571731567, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5622894763946533, + "rewards/thermo_reward/std": 1.0893287658691406, + "step": 1789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.09077019430696964, + "epoch": 3.58, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10219375044107437, + "learning_rate": 3.8759967334135214e-07, + "loss": 0.0043, + "num_tokens": 15626485.0, + "reward": 12.912355422973633, + "reward_std": 4.981294631958008, + "rewards/fitness_reward/mean": 6.975485801696777, + "rewards/fitness_reward/std": 2.18184494972229, + "rewards/kidney_reward/mean": 2.4065518379211426, + "rewards/kidney_reward/std": 1.244099736213684, + "rewards/length2tails_reward/mean": 0.788223147392273, + "rewards/length2tails_reward/std": 0.2414843589067459, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3514962196350098, + "rewards/thermo_reward/std": 1.6070235967636108, + "step": 1790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 283.28125, + "completions/mean_terminated_length": 283.28125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08667771378532052, + "epoch": 3.582, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4496097266674042, + "learning_rate": 3.865864712956336e-07, + "loss": -0.0498, + "num_tokens": 15635582.0, + "reward": 13.995429992675781, + "reward_std": 0.02127438597381115, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.787126898765564, + "rewards/length2tails_reward/std": 0.21274513006210327, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.829052448272705, + "rewards/thermo_reward/std": 0.0, + "step": 1791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.75, + "completions/mean_terminated_length": 273.75, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.08310918603092432, + "epoch": 3.584, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09372947365045547, + "learning_rate": 3.8557427785433536e-07, + "loss": 0.0049, + "num_tokens": 15644374.0, + "reward": 13.748785018920898, + "reward_std": 0.6206957101821899, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8621543645858765, + "rewards/length2tails_reward/std": 0.21344423294067383, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08902006130665541, + "epoch": 3.586, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10767589509487152, + "learning_rate": 3.8456309468175527e-07, + "loss": 0.0, + "num_tokens": 15653117.0, + "reward": 13.834826469421387, + "reward_std": 0.4260368347167969, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7765315771102905, + "rewards/length2tails_reward/std": 0.23316825926303864, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.08544992376118898, + "epoch": 3.588, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2308015078306198, + "learning_rate": 3.835529234405303e-07, + "loss": -0.0079, + "num_tokens": 15661841.0, + "reward": 13.996078491210938, + "reward_std": 0.022238049656152725, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7936158180236816, + "rewards/length2tails_reward/std": 0.22238068282604218, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.829052448272705, + "rewards/thermo_reward/std": 0.0, + "step": 1794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 272.5, + "completions/mean_terminated_length": 272.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08660221099853516, + "epoch": 3.59, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14856624603271484, + "learning_rate": 3.825437657916325e-07, + "loss": 0.001, + "num_tokens": 15670593.0, + "reward": 13.877517700195312, + "reward_std": 0.37583112716674805, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8045819401741028, + "rewards/length2tails_reward/std": 0.22768016159534454, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09035045560449362, + "epoch": 3.592, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10779079794883728, + "learning_rate": 3.815356233943685e-07, + "loss": -0.0057, + "num_tokens": 15679349.0, + "reward": 13.757761001586914, + "reward_std": 0.5190196633338928, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8035960793495178, + "rewards/length2tails_reward/std": 0.27802371978759766, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 285.75, + "completions/mean_terminated_length": 270.6451416015625, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.10499814618378878, + "epoch": 3.594, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3619617223739624, + "learning_rate": 3.805284979063752e-07, + "loss": -0.0203, + "num_tokens": 15688525.0, + "reward": 13.837220191955566, + "reward_std": 0.42661750316619873, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8004651665687561, + "rewards/length2tails_reward/std": 0.24887455999851227, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.0625, + "completions/mean_terminated_length": 271.0625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 0.08877876028418541, + "epoch": 3.596, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09175974875688553, + "learning_rate": 3.7952239098361726e-07, + "loss": 0.0021, + "num_tokens": 15697231.0, + "reward": 13.38138198852539, + "reward_std": 1.7214365005493164, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5350708961486816, + "rewards/kidney_reward/std": 0.5170859098434448, + "rewards/length2tails_reward/mean": 0.8196229934692383, + "rewards/length2tails_reward/std": 0.20097720623016357, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3031632900238037, + "rewards/thermo_reward/std": 1.2440290451049805, + "step": 1798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08055632095783949, + "epoch": 3.598, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09719519317150116, + "learning_rate": 3.7851730428038473e-07, + "loss": -0.0062, + "num_tokens": 15705995.0, + "reward": 13.470977783203125, + "reward_std": 1.5660966634750366, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8105138540267944, + "rewards/length2tails_reward/std": 0.26108792424201965, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.356980562210083, + "rewards/thermo_reward/std": 1.4137026071548462, + "step": 1799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.84375, + "completions/mean_terminated_length": 269.84375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.08724918495863676, + "epoch": 3.6, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.288025826215744, + "learning_rate": 3.7751323944929057e-07, + "loss": 0.008, + "num_tokens": 15714662.0, + "reward": 13.290872573852539, + "reward_std": 2.4588677883148193, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.464883327484131, + "rewards/kidney_reward/std": 0.7700634002685547, + "rewards/length2tails_reward/mean": 0.8100026249885559, + "rewards/length2tails_reward/std": 0.21141447126865387, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3413124084472656, + "rewards/thermo_reward/std": 1.4779037237167358, + "step": 1800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.03125, + "completions/mean_terminated_length": 273.03125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08742036391049623, + "epoch": 3.602, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21453461050987244, + "learning_rate": 3.765101981412665e-07, + "loss": 0.0058, + "num_tokens": 15723431.0, + "reward": 13.496167182922363, + "reward_std": 1.5324259996414185, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5327911376953125, + "rewards/kidney_reward/std": 0.5299828052520752, + "rewards/length2tails_reward/mean": 0.828900933265686, + "rewards/length2tails_reward/std": 0.20199020206928253, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4193015098571777, + "rewards/thermo_reward/std": 1.0495328903198242, + "step": 1801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.875, + "completions/mean_terminated_length": 273.875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08902319148182869, + "epoch": 3.604, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11332407593727112, + "learning_rate": 3.755081820055621e-07, + "loss": 0.0011, + "num_tokens": 15732227.0, + "reward": 13.84552001953125, + "reward_std": 0.42908626794815063, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8834710121154785, + "rewards/length2tails_reward/std": 0.16791842877864838, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.08871565293520689, + "epoch": 3.606, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07865909487009048, + "learning_rate": 3.7450719268974127e-07, + "loss": -0.0011, + "num_tokens": 15740917.0, + "reward": 12.999435424804688, + "reward_std": 4.383616924285889, + "rewards/fitness_reward/mean": 7.023216724395752, + "rewards/fitness_reward/std": 1.911836862564087, + "rewards/kidney_reward/mean": 2.4475834369659424, + "rewards/kidney_reward/std": 1.011989712715149, + "rewards/length2tails_reward/mean": 0.8283498287200928, + "rewards/length2tails_reward/std": 0.18109019100666046, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.345799446105957, + "rewards/thermo_reward/std": 1.6405893564224243, + "step": 1803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.78125, + "completions/mean_terminated_length": 273.78125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08313896227627993, + "epoch": 3.608, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13741354644298553, + "learning_rate": 3.7350723183967935e-07, + "loss": -0.001, + "num_tokens": 15749710.0, + "reward": 13.44559097290039, + "reward_std": 1.4326897859573364, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8529207110404968, + "rewards/length2tails_reward/std": 0.19226661324501038, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3273532390594482, + "rewards/thermo_reward/std": 1.2326303720474243, + "step": 1804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08045259164646268, + "epoch": 3.61, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10184618085622787, + "learning_rate": 3.725083010995611e-07, + "loss": -0.0012, + "num_tokens": 15758454.0, + "reward": 13.835386276245117, + "reward_std": 0.43626150488853455, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7821381688117981, + "rewards/length2tails_reward/std": 0.2621596157550812, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08251079404726624, + "epoch": 3.612, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12113630771636963, + "learning_rate": 3.715104021118763e-07, + "loss": 0.0001, + "num_tokens": 15767209.0, + "reward": 13.416242599487305, + "reward_std": 2.024634599685669, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.463491201400757, + "rewards/kidney_reward/std": 0.7777802348136902, + "rewards/length2tails_reward/mean": 0.7820241451263428, + "rewards/length2tails_reward/std": 0.26788899302482605, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4133644104003906, + "rewards/thermo_reward/std": 1.27640962600708, + "step": 1806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 274.21875, + "completions/mean_terminated_length": 274.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08606186509132385, + "epoch": 3.614, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15508778393268585, + "learning_rate": 3.705135365174197e-07, + "loss": 0.0021, + "num_tokens": 15776016.0, + "reward": 13.849562644958496, + "reward_std": 0.4784981906414032, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7986260652542114, + "rewards/length2tails_reward/std": 0.26385998725891113, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0820772061124444, + "epoch": 3.616, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14850305020809174, + "learning_rate": 3.6951770595528606e-07, + "loss": 0.0002, + "num_tokens": 15784772.0, + "reward": 12.774524688720703, + "reward_std": 4.888601303100586, + "rewards/fitness_reward/mean": 6.999087333679199, + "rewards/fitness_reward/std": 2.048335552215576, + "rewards/kidney_reward/mean": 2.341603994369507, + "rewards/kidney_reward/std": 1.1970475912094116, + "rewards/length2tails_reward/mean": 0.8276439905166626, + "rewards/length2tails_reward/std": 0.22841252386569977, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2510693073272705, + "rewards/thermo_reward/std": 1.8503024578094482, + "step": 1808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08443187456578016, + "epoch": 3.618, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1716132014989853, + "learning_rate": 3.685229120628688e-07, + "loss": 0.0012, + "num_tokens": 15793535.0, + "reward": 13.70191478729248, + "reward_std": 0.8540881872177124, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7989389300346375, + "rewards/length2tails_reward/std": 0.24277861416339874, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.534355878829956, + "rewards/thermo_reward/std": 0.8508411049842834, + "step": 1809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.09030141588300467, + "epoch": 3.62, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16818749904632568, + "learning_rate": 3.6752915647585646e-07, + "loss": 0.0007, + "num_tokens": 15802228.0, + "reward": 13.565706253051758, + "reward_std": 1.2407891750335693, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7469499111175537, + "rewards/length2tails_reward/std": 0.29119476675987244, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4307057857513428, + "rewards/thermo_reward/std": 1.130438208580017, + "step": 1810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.07465990725904703, + "epoch": 3.622, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08960974216461182, + "learning_rate": 3.665364408282304e-07, + "loss": 0.0034, + "num_tokens": 15810998.0, + "reward": 13.920299530029297, + "reward_std": 0.31105008721351624, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8335549831390381, + "rewards/length2tails_reward/std": 0.20239099860191345, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0867516491562128, + "epoch": 3.624, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07420986890792847, + "learning_rate": 3.6554476675226156e-07, + "loss": 0.0002, + "num_tokens": 15819763.0, + "reward": 13.589237213134766, + "reward_std": 1.4417500495910645, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8086227774620056, + "rewards/length2tails_reward/std": 0.24868273735046387, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5055789947509766, + "rewards/thermo_reward/std": 0.9949191212654114, + "step": 1812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08404396940022707, + "epoch": 3.626, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11983322352170944, + "learning_rate": 3.6455413587850926e-07, + "loss": -0.003, + "num_tokens": 15828474.0, + "reward": 13.836545944213867, + "reward_std": 0.8850547075271606, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7509810924530029, + "rewards/length2tails_reward/std": 0.19866718351840973, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.673783779144287, + "rewards/thermo_reward/std": 0.8783318996429443, + "step": 1813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.09375, + "completions/mean_terminated_length": 273.09375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.07894755946472287, + "epoch": 3.628, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14280594885349274, + "learning_rate": 3.6356454983581695e-07, + "loss": 0.0019, + "num_tokens": 15837245.0, + "reward": 13.78122329711914, + "reward_std": 0.5930385589599609, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.787685215473175, + "rewards/length2tails_reward/std": 0.26022645831108093, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.34375, + "completions/mean_terminated_length": 273.34375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0844585420563817, + "epoch": 3.63, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09027701616287231, + "learning_rate": 3.625760102513102e-07, + "loss": -0.0016, + "num_tokens": 15846024.0, + "reward": 13.760713577270508, + "reward_std": 0.5119152069091797, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8331196904182434, + "rewards/length2tails_reward/std": 0.20415940880775452, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061467885971069, + "step": 1815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.1875, + "completions/mean_terminated_length": 273.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08118011150509119, + "epoch": 3.632, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08081858605146408, + "learning_rate": 3.6158851875039456e-07, + "loss": -0.003, + "num_tokens": 15854798.0, + "reward": 13.591593742370605, + "reward_std": 1.0339016914367676, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7975776195526123, + "rewards/length2tails_reward/std": 0.2879304885864258, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4515304565429688, + "rewards/thermo_reward/std": 0.8939418792724609, + "step": 1816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0785574521869421, + "epoch": 3.634, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16642813384532928, + "learning_rate": 3.606020769567507e-07, + "loss": 0.0061, + "num_tokens": 15863570.0, + "reward": 13.730648040771484, + "reward_std": 0.5763334035873413, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8060550093650818, + "rewards/length2tails_reward/std": 0.2286890745162964, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08207452390342951, + "epoch": 3.636, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1245543584227562, + "learning_rate": 3.596166864923348e-07, + "loss": -0.0038, + "num_tokens": 15872301.0, + "reward": 13.270090103149414, + "reward_std": 3.044645071029663, + "rewards/fitness_reward/mean": 7.052626132965088, + "rewards/fitness_reward/std": 1.7454723119735718, + "rewards/kidney_reward/mean": 2.5107221603393555, + "rewards/kidney_reward/std": 0.5186686515808105, + "rewards/length2tails_reward/mean": 0.7430772185325623, + "rewards/length2tails_reward/std": 0.29433321952819824, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.53243350982666, + "rewards/thermo_reward/std": 0.8602582216262817, + "step": 1818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.085698701441288, + "epoch": 3.638, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3150479793548584, + "learning_rate": 3.586323489773739e-07, + "loss": -0.001, + "num_tokens": 15881046.0, + "reward": 13.878179550170898, + "reward_std": 0.3804786503314972, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8112087845802307, + "rewards/length2tails_reward/std": 0.21879929304122925, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.21875, + "completions/mean_terminated_length": 273.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09197020716965199, + "epoch": 3.64, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16853182017803192, + "learning_rate": 3.576490660303636e-07, + "loss": -0.0029, + "num_tokens": 15889821.0, + "reward": 13.605411529541016, + "reward_std": 1.368744969367981, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.825190544128418, + "rewards/length2tails_reward/std": 0.22926637530326843, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4625871181488037, + "rewards/thermo_reward/std": 1.2189743518829346, + "step": 1820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08115828037261963, + "epoch": 3.642, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08074554055929184, + "learning_rate": 3.566668392680662e-07, + "loss": -0.0062, + "num_tokens": 15898582.0, + "reward": 13.297192573547363, + "reward_std": 2.0624260902404785, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8180257678031921, + "rewards/length2tails_reward/std": 0.22408005595207214, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1550843715667725, + "rewards/thermo_reward/std": 1.9389499425888062, + "step": 1821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0853598746471107, + "epoch": 3.644, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08734510093927383, + "learning_rate": 3.5568567030550577e-07, + "loss": -0.0006, + "num_tokens": 15907306.0, + "reward": 13.05596923828125, + "reward_std": 2.8374533653259277, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4475936889648438, + "rewards/kidney_reward/std": 0.8660976886749268, + "rewards/length2tails_reward/mean": 0.7015992403030396, + "rewards/length2tails_reward/std": 0.31296905875205994, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1345396041870117, + "rewards/thermo_reward/std": 1.8032125234603882, + "step": 1822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08595753367990255, + "epoch": 3.646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10818291455507278, + "learning_rate": 3.547055607559688e-07, + "loss": 0.0033, + "num_tokens": 15916050.0, + "reward": 13.50337028503418, + "reward_std": 1.7344375848770142, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.530332565307617, + "rewards/kidney_reward/std": 0.5438900589942932, + "rewards/length2tails_reward/mean": 0.7964829802513123, + "rewards/length2tails_reward/std": 0.2236173003911972, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.432204484939575, + "rewards/thermo_reward/std": 1.280646800994873, + "step": 1823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08719247533008456, + "epoch": 3.648, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43279126286506653, + "learning_rate": 3.5372651223099915e-07, + "loss": -0.002, + "num_tokens": 15924807.0, + "reward": 12.916872024536133, + "reward_std": 4.81889533996582, + "rewards/fitness_reward/mean": 6.978363990783691, + "rewards/fitness_reward/std": 2.165562629699707, + "rewards/kidney_reward/mean": 2.306910991668701, + "rewards/kidney_reward/std": 1.3264926671981812, + "rewards/length2tails_reward/mean": 0.7791447043418884, + "rewards/length2tails_reward/std": 0.2675744891166687, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4536831378936768, + "rewards/thermo_reward/std": 1.473813772201538, + "step": 1824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08192221820354462, + "epoch": 3.65, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09442069381475449, + "learning_rate": 3.52748526340396e-07, + "loss": 0.0041, + "num_tokens": 15933565.0, + "reward": 13.876182556152344, + "reward_std": 0.3726004660129547, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7912297248840332, + "rewards/length2tails_reward/std": 0.27527180314064026, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.25, + "completions/mean_terminated_length": 273.25, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08800566708669066, + "epoch": 3.652, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15255124866962433, + "learning_rate": 3.5177160469221176e-07, + "loss": 0.0039, + "num_tokens": 15942341.0, + "reward": 13.62850284576416, + "reward_std": 1.0358679294586182, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8195139169692993, + "rewards/length2tails_reward/std": 0.21823741495609283, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4862453937530518, + "rewards/thermo_reward/std": 0.9058089852333069, + "step": 1826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "entropy": 0.09326956886798143, + "epoch": 3.654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14029085636138916, + "learning_rate": 3.50795748892748e-07, + "loss": -0.0016, + "num_tokens": 15951059.0, + "reward": 13.559985160827637, + "reward_std": 1.790492296218872, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5264534950256348, + "rewards/kidney_reward/std": 0.5658333897590637, + "rewards/length2tails_reward/mean": 0.7611898183822632, + "rewards/length2tails_reward/std": 0.2647976577281952, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.496227741241455, + "rewards/thermo_reward/std": 1.2426968812942505, + "step": 1827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.08237532759085298, + "epoch": 3.656, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08693405240774155, + "learning_rate": 3.4982096054655477e-07, + "loss": -0.0036, + "num_tokens": 15959772.0, + "reward": 13.295555114746094, + "reward_std": 3.090214252471924, + "rewards/fitness_reward/mean": 6.941235542297363, + "rewards/fitness_reward/std": 2.0655312538146973, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7664594650268555, + "rewards/length2tails_reward/std": 0.3026435375213623, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.605912208557129, + "rewards/thermo_reward/std": 0.8559578061103821, + "step": 1828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08308725710958242, + "epoch": 3.658, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23046323657035828, + "learning_rate": 3.488472412564264e-07, + "loss": 0.0061, + "num_tokens": 15968510.0, + "reward": 13.662422180175781, + "reward_std": 1.0406625270843506, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7909877300262451, + "rewards/length2tails_reward/std": 0.2367888242006302, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5230178833007812, + "rewards/thermo_reward/std": 0.9068526029586792, + "step": 1829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.07731601735576987, + "epoch": 3.66, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24017341434955597, + "learning_rate": 3.478745926233998e-07, + "loss": -0.0039, + "num_tokens": 15977230.0, + "reward": 13.384349822998047, + "reward_std": 3.007572889328003, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.5390896797180176, + "rewards/kidney_reward/std": 0.49435171484947205, + "rewards/length2tails_reward/mean": 0.7631849050521851, + "rewards/length2tails_reward/std": 0.2363831102848053, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6158881187438965, + "rewards/thermo_reward/std": 0.8037141561508179, + "step": 1830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08287757448852062, + "epoch": 3.662, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07217924296855927, + "learning_rate": 3.4690301624675123e-07, + "loss": -0.0047, + "num_tokens": 15985953.0, + "reward": 13.908952713012695, + "reward_std": 0.31878843903541565, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.720072865486145, + "rewards/length2tails_reward/std": 0.30547112226486206, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.07862121984362602, + "epoch": 3.664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09833700209856033, + "learning_rate": 3.4593251372399414e-07, + "loss": 0.0056, + "num_tokens": 15994677.0, + "reward": 13.86894416809082, + "reward_std": 0.3705652952194214, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7188472151756287, + "rewards/length2tails_reward/std": 0.26971179246902466, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.07587908860296011, + "epoch": 3.666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09324633330106735, + "learning_rate": 3.449630866508757e-07, + "loss": -0.0022, + "num_tokens": 16003440.0, + "reward": 13.91788101196289, + "reward_std": 0.3153970539569855, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.809357762336731, + "rewards/length2tails_reward/std": 0.21493566036224365, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07948243711143732, + "epoch": 3.668, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12330475449562073, + "learning_rate": 3.4399473662137514e-07, + "loss": -0.0001, + "num_tokens": 16012163.0, + "reward": 13.845161437988281, + "reward_std": 0.4772983491420746, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7546184062957764, + "rewards/length2tails_reward/std": 0.2744663953781128, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.59375, + "completions/mean_terminated_length": 271.59375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.08797703590244055, + "epoch": 3.67, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1077851727604866, + "learning_rate": 3.4302746522770076e-07, + "loss": 0.0025, + "num_tokens": 16020886.0, + "reward": 13.835163116455078, + "reward_std": 0.42623722553253174, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7799012064933777, + "rewards/length2tails_reward/std": 0.2840479612350464, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 274.96875, + "completions/mean_terminated_length": 274.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08596474630758166, + "epoch": 3.672, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1660880595445633, + "learning_rate": 3.420612740602874e-07, + "loss": -0.0045, + "num_tokens": 16029717.0, + "reward": 13.80670166015625, + "reward_std": 0.5210345983505249, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7688755989074707, + "rewards/length2tails_reward/std": 0.2551930248737335, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.08587139938026667, + "epoch": 3.674, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12363027781248093, + "learning_rate": 3.410961647077939e-07, + "loss": 0.0009, + "num_tokens": 16038465.0, + "reward": 13.799466133117676, + "reward_std": 0.47291746735572815, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.821791410446167, + "rewards/length2tails_reward/std": 0.20731790363788605, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.07770541356876493, + "epoch": 3.676, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06414253264665604, + "learning_rate": 3.401321387571001e-07, + "loss": -0.0052, + "num_tokens": 16047225.0, + "reward": 13.58360481262207, + "reward_std": 2.1439197063446045, + "rewards/fitness_reward/mean": 7.052915573120117, + "rewards/fitness_reward/std": 1.7438353300094604, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.822883129119873, + "rewards/length2tails_reward/std": 0.2441834658384323, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0891623767092824, + "epoch": 3.678, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1793835461139679, + "learning_rate": 3.39169197793304e-07, + "loss": -0.0042, + "num_tokens": 16055983.0, + "reward": 13.732637405395508, + "reward_std": 0.5838374495506287, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8259539604187012, + "rewards/length2tails_reward/std": 0.24140463769435883, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.03125, + "completions/mean_terminated_length": 273.03125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.09458326548337936, + "epoch": 3.68, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1907602846622467, + "learning_rate": 3.3820734339972036e-07, + "loss": 0.0045, + "num_tokens": 16064752.0, + "reward": 13.851795196533203, + "reward_std": 0.4789268374443054, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8209490776062012, + "rewards/length2tails_reward/std": 0.2316625416278839, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 274.4375, + "completions/mean_terminated_length": 274.4375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0811676699668169, + "epoch": 3.682, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0921485498547554, + "learning_rate": 3.372465771578771e-07, + "loss": 0.0031, + "num_tokens": 16073566.0, + "reward": 13.846429824829102, + "reward_std": 0.42756423354148865, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8925660252571106, + "rewards/length2tails_reward/std": 0.1536208838224411, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08341225422918797, + "epoch": 3.684, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10851159691810608, + "learning_rate": 3.362869006475126e-07, + "loss": 0.0031, + "num_tokens": 16082312.0, + "reward": 13.754069328308105, + "reward_std": 0.5051019787788391, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7666773796081543, + "rewards/length2tails_reward/std": 0.303233802318573, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08298153709620237, + "epoch": 3.686, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13726581633090973, + "learning_rate": 3.3532831544657456e-07, + "loss": 0.0009, + "num_tokens": 16091041.0, + "reward": 13.767756462097168, + "reward_std": 0.5509318113327026, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.778285562992096, + "rewards/length2tails_reward/std": 0.24406449496746063, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.21875, + "completions/mean_terminated_length": 273.21875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0828853053972125, + "epoch": 3.6879999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.070669025182724, + "learning_rate": 3.3437082313121447e-07, + "loss": 0.0032, + "num_tokens": 16099816.0, + "reward": 13.959261894226074, + "reward_std": 0.22338415682315826, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8243112564086914, + "rewards/length2tails_reward/std": 0.20171445608139038, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.088692725636065, + "epoch": 3.69, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06675875186920166, + "learning_rate": 3.3341442527578835e-07, + "loss": -0.0004, + "num_tokens": 16108572.0, + "reward": 13.817087173461914, + "reward_std": 0.8259702324867249, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8217202425003052, + "rewards/length2tails_reward/std": 0.2654772996902466, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6472504138946533, + "rewards/thermo_reward/std": 0.8268661499023438, + "step": 1845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.90625, + "completions/mean_terminated_length": 273.90625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08372374624013901, + "epoch": 3.692, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08932638168334961, + "learning_rate": 3.3245912345285197e-07, + "loss": -0.0023, + "num_tokens": 16117369.0, + "reward": 13.630112648010254, + "reward_std": 1.149518370628357, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8636769652366638, + "rewards/length2tails_reward/std": 0.16939999163150787, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.510798454284668, + "rewards/thermo_reward/std": 0.9683473706245422, + "step": 1846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.08410823810845613, + "epoch": 3.694, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17860926687717438, + "learning_rate": 3.315049192331595e-07, + "loss": -0.0142, + "num_tokens": 16126124.0, + "reward": 13.166458129882812, + "reward_std": 2.05751371383667, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8351097702980042, + "rewards/length2tails_reward/std": 0.23980149626731873, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.050001621246338, + "rewards/thermo_reward/std": 1.8831400871276855, + "step": 1847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.1875, + "completions/mean_terminated_length": 273.1875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08229447342455387, + "epoch": 3.6959999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09073762595653534, + "learning_rate": 3.3055181418566e-07, + "loss": -0.0009, + "num_tokens": 16134898.0, + "reward": 13.835367202758789, + "reward_std": 0.4329942762851715, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7819371223449707, + "rewards/length2tails_reward/std": 0.26615023612976074, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 273.53125, + "completions/mean_terminated_length": 273.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08659685775637627, + "epoch": 3.698, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47516101598739624, + "learning_rate": 3.2959980987749483e-07, + "loss": 0.0124, + "num_tokens": 16143683.0, + "reward": 13.798820495605469, + "reward_std": 0.46693822741508484, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8153313994407654, + "rewards/length2tails_reward/std": 0.24201750755310059, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07372273737564683, + "epoch": 3.7, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.049018096178770065, + "learning_rate": 3.28648907873996e-07, + "loss": -0.0049, + "num_tokens": 16152418.0, + "reward": 13.620929718017578, + "reward_std": 1.8910012245178223, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5256857872009277, + "rewards/kidney_reward/std": 0.5701754689216614, + "rewards/length2tails_reward/mean": 0.7492478489875793, + "rewards/length2tails_reward/std": 0.271221786737442, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.559133529663086, + "rewards/thermo_reward/std": 1.3134887218475342, + "step": 1850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07341090962290764, + "epoch": 3.702, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10206151008605957, + "learning_rate": 3.276991097386831e-07, + "loss": -0.0069, + "num_tokens": 16161179.0, + "reward": 13.129651069641113, + "reward_std": 2.6763594150543213, + "rewards/fitness_reward/mean": 6.988076210021973, + "rewards/fitness_reward/std": 2.110623359680176, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.767514169216156, + "rewards/length2tails_reward/std": 0.30354535579681396, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.393061637878418, + "rewards/thermo_reward/std": 0.99179607629776, + "step": 1851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07788072619587183, + "epoch": 3.7039999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10819029062986374, + "learning_rate": 3.2675041703326046e-07, + "loss": -0.0037, + "num_tokens": 16169927.0, + "reward": 13.877944946289062, + "reward_std": 0.38620296120643616, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8088620901107788, + "rewards/length2tails_reward/std": 0.21891973912715912, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.07938122935593128, + "epoch": 3.706, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07176394760608673, + "learning_rate": 3.258028313176151e-07, + "loss": -0.0005, + "num_tokens": 16178637.0, + "reward": 13.868053436279297, + "reward_std": 0.3806667625904083, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7099429368972778, + "rewards/length2tails_reward/std": 0.28471970558166504, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.0, + "completions/mean_terminated_length": 273.0, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.08443743642419577, + "epoch": 3.708, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11585229635238647, + "learning_rate": 3.2485635414981315e-07, + "loss": -0.0001, + "num_tokens": 16187405.0, + "reward": 13.549070358276367, + "reward_std": 1.1941407918930054, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8482940196990967, + "rewards/length2tails_reward/std": 0.22004073858261108, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4312944412231445, + "rewards/thermo_reward/std": 1.127699613571167, + "step": 1854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08564404863864183, + "epoch": 3.71, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12348084151744843, + "learning_rate": 3.2391098708609897e-07, + "loss": -0.001, + "num_tokens": 16196124.0, + "reward": 12.726795196533203, + "reward_std": 4.499066352844238, + "rewards/fitness_reward/mean": 7.01171875, + "rewards/fitness_reward/std": 1.9768810272216797, + "rewards/kidney_reward/mean": 2.339609146118164, + "rewards/kidney_reward/std": 1.1481411457061768, + "rewards/length2tails_reward/mean": 0.7159409523010254, + "rewards/length2tails_reward/std": 0.32566535472869873, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.203874111175537, + "rewards/thermo_reward/std": 1.7565038204193115, + "step": 1855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.09375, + "completions/mean_terminated_length": 271.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0838960474357009, + "epoch": 3.7119999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11009681224822998, + "learning_rate": 3.229667316808907e-07, + "loss": -0.0031, + "num_tokens": 16204831.0, + "reward": 13.504732131958008, + "reward_std": 2.51141619682312, + "rewards/fitness_reward/mean": 6.986984729766846, + "rewards/fitness_reward/std": 2.116797685623169, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.6934654116630554, + "rewards/length2tails_reward/std": 0.2892797291278839, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09317321423441172, + "epoch": 3.714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10668087750673294, + "learning_rate": 3.220235894867793e-07, + "loss": 0.0006, + "num_tokens": 16213575.0, + "reward": 13.71420955657959, + "reward_std": 0.535295844078064, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7669464349746704, + "rewards/length2tails_reward/std": 0.3002374470233917, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5498507022857666, + "rewards/thermo_reward/std": 0.5360844731330872, + "step": 1857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.53125, + "completions/mean_terminated_length": 272.53125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 0.07743867486715317, + "epoch": 3.716, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12143448740243912, + "learning_rate": 3.2108156205452506e-07, + "loss": 0.0044, + "num_tokens": 16222328.0, + "reward": 13.630332946777344, + "reward_std": 1.0454219579696655, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8501394987106323, + "rewards/length2tails_reward/std": 0.2279629111289978, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4850144386291504, + "rewards/thermo_reward/std": 0.9117720723152161, + "step": 1858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.07970911916345358, + "epoch": 3.718, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09778325259685516, + "learning_rate": 3.2014065093305564e-07, + "loss": -0.0011, + "num_tokens": 16231057.0, + "reward": 13.690130233764648, + "reward_std": 1.086830973625183, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7724133133888245, + "rewards/length2tails_reward/std": 0.2743414640426636, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5525827407836914, + "rewards/thermo_reward/std": 0.9439334273338318, + "step": 1859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08170908875763416, + "epoch": 3.7199999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1340896338224411, + "learning_rate": 3.19200857669463e-07, + "loss": 0.0023, + "num_tokens": 16239771.0, + "reward": 13.950653076171875, + "reward_std": 0.22248882055282593, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.738226056098938, + "rewards/length2tails_reward/std": 0.25630003213882446, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07605164684355259, + "epoch": 3.722, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9549767971038818, + "learning_rate": 3.182621838090006e-07, + "loss": -0.0006, + "num_tokens": 16248511.0, + "reward": 12.998878479003906, + "reward_std": 3.2146313190460205, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.452592670917511, + "rewards/kidney_reward/mean": 2.349088191986084, + "rewards/kidney_reward/std": 1.102823257446289, + "rewards/length2tails_reward/mean": 0.7339838743209839, + "rewards/length2tails_reward/std": 0.3027809262275696, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2302253246307373, + "rewards/thermo_reward/std": 1.777825117111206, + "step": 1861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0779388127848506, + "epoch": 3.724, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11938080191612244, + "learning_rate": 3.17324630895082e-07, + "loss": -0.0008, + "num_tokens": 16257265.0, + "reward": 13.55603313446045, + "reward_std": 1.6116957664489746, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5346274375915527, + "rewards/kidney_reward/std": 0.519594669342041, + "rewards/length2tails_reward/mean": 0.7872436046600342, + "rewards/length2tails_reward/std": 0.26303040981292725, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4814963340759277, + "rewards/thermo_reward/std": 1.1194443702697754, + "step": 1862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.08677832735702395, + "epoch": 3.726, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19907931983470917, + "learning_rate": 3.163882004692774e-07, + "loss": 0.0037, + "num_tokens": 16266008.0, + "reward": 13.88401985168457, + "reward_std": 0.37567073106765747, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8696078062057495, + "rewards/length2tails_reward/std": 0.174245685338974, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0809440454468131, + "epoch": 3.7279999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06956552714109421, + "learning_rate": 3.154528940713113e-07, + "loss": -0.0064, + "num_tokens": 16274764.0, + "reward": 13.794866561889648, + "reward_std": 1.1583558320999146, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8030489087104797, + "rewards/length2tails_reward/std": 0.23350834846496582, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.654256820678711, + "rewards/thermo_reward/std": 0.9887924194335938, + "step": 1864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.08229780849069357, + "epoch": 3.73, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.110500268638134, + "learning_rate": 3.145187132390604e-07, + "loss": 0.0032, + "num_tokens": 16283486.0, + "reward": 13.691943168640137, + "reward_std": 0.9804035425186157, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7836110591888428, + "rewards/length2tails_reward/std": 0.27624985575675964, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.553276300430298, + "rewards/thermo_reward/std": 0.9403393864631653, + "step": 1865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.08790844492614269, + "epoch": 3.732, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07562534511089325, + "learning_rate": 3.1358565950854976e-07, + "loss": 0.0002, + "num_tokens": 16292234.0, + "reward": 13.780458450317383, + "reward_std": 0.8375572562217712, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8365454077720642, + "rewards/length2tails_reward/std": 0.2498382180929184, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.609139919281006, + "rewards/thermo_reward/std": 0.8389954566955566, + "step": 1866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.40625, + "completions/mean_terminated_length": 271.40625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08745685964822769, + "epoch": 3.734, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07099781185388565, + "learning_rate": 3.12653734413952e-07, + "loss": 0.0009, + "num_tokens": 16300951.0, + "reward": 13.770774841308594, + "reward_std": 0.8302472233772278, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7253307104110718, + "rewards/length2tails_reward/std": 0.27975571155548096, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.61057710647583, + "rewards/thermo_reward/std": 0.8314604163169861, + "step": 1867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08242936944589019, + "epoch": 3.7359999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.139634907245636, + "learning_rate": 3.11722939487584e-07, + "loss": -0.0, + "num_tokens": 16309723.0, + "reward": 13.959595680236816, + "reward_std": 0.22406338155269623, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8276500701904297, + "rewards/length2tails_reward/std": 0.21905958652496338, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "entropy": 0.08443598542362452, + "epoch": 3.738, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13019409775733948, + "learning_rate": 3.1079327625990403e-07, + "loss": 0.0008, + "num_tokens": 16318478.0, + "reward": 13.960803985595703, + "reward_std": 0.22400137782096863, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.839726448059082, + "rewards/length2tails_reward/std": 0.18779398500919342, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 274.34375, + "completions/mean_terminated_length": 274.34375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08120616152882576, + "epoch": 3.74, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09351641684770584, + "learning_rate": 3.098647462595099e-07, + "loss": -0.001, + "num_tokens": 16327289.0, + "reward": 13.639875411987305, + "reward_std": 0.9192527532577515, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8695728182792664, + "rewards/length2tails_reward/std": 0.20318247377872467, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.492612838745117, + "rewards/thermo_reward/std": 0.8752034902572632, + "step": 1870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08450864860787988, + "epoch": 3.742, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08416204154491425, + "learning_rate": 3.0893735101313535e-07, + "loss": -0.0036, + "num_tokens": 16336011.0, + "reward": 13.434139251708984, + "reward_std": 1.6701217889785767, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7553468346595764, + "rewards/length2tails_reward/std": 0.2860553562641144, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.2982993125915527, + "rewards/thermo_reward/std": 1.5283452272415161, + "step": 1871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.84375, + "completions/mean_terminated_length": 273.84375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08435946051031351, + "epoch": 3.7439999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09009895473718643, + "learning_rate": 3.0801109204564926e-07, + "loss": -0.0041, + "num_tokens": 16344806.0, + "reward": 13.561756134033203, + "reward_std": 1.422103762626648, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8623050451278687, + "rewards/length2tails_reward/std": 0.20491470396518707, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.415219783782959, + "rewards/thermo_reward/std": 1.2666558027267456, + "step": 1872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.46875, + "completions/mean_terminated_length": 273.46875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.08299027197062969, + "epoch": 3.746, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08488171547651291, + "learning_rate": 3.070859708800515e-07, + "loss": 0.0029, + "num_tokens": 16353589.0, + "reward": 13.885295867919922, + "reward_std": 0.37565287947654724, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8823668360710144, + "rewards/length2tails_reward/std": 0.18453270196914673, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "entropy": 0.08089287113398314, + "epoch": 3.748, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10338589549064636, + "learning_rate": 3.0616198903747157e-07, + "loss": 0.0039, + "num_tokens": 16362321.0, + "reward": 13.87789535522461, + "reward_std": 0.3744944632053375, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8083574771881104, + "rewards/length2tails_reward/std": 0.20511792600154877, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0791722135618329, + "epoch": 3.75, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11482106894254684, + "learning_rate": 3.0523914803716577e-07, + "loss": 0.0039, + "num_tokens": 16371061.0, + "reward": 13.954635620117188, + "reward_std": 0.2227708101272583, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7780531048774719, + "rewards/length2tails_reward/std": 0.2267376035451889, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08384049450978637, + "epoch": 3.752, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09511395543813705, + "learning_rate": 3.043174493965136e-07, + "loss": -0.0002, + "num_tokens": 16379799.0, + "reward": 13.39472770690918, + "reward_std": 2.6052229404449463, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4486875534057617, + "rewards/kidney_reward/std": 0.860011100769043, + "rewards/length2tails_reward/mean": 0.7839442491531372, + "rewards/length2tails_reward/std": 0.25025689601898193, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4639697074890137, + "rewards/thermo_reward/std": 1.4808531999588013, + "step": 1876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09349369630217552, + "epoch": 3.754, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05819488316774368, + "learning_rate": 3.0339689463101714e-07, + "loss": -0.0045, + "num_tokens": 16388543.0, + "reward": 13.307064056396484, + "reward_std": 3.007098436355591, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.5390896797180176, + "rewards/kidney_reward/std": 0.49435171484947205, + "rewards/length2tails_reward/mean": 0.7880429625511169, + "rewards/length2tails_reward/std": 0.24534600973129272, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.536116123199463, + "rewards/thermo_reward/std": 0.8422486782073975, + "step": 1877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.10010077618062496, + "epoch": 3.7560000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7483545541763306, + "learning_rate": 3.0247748525429785e-07, + "loss": -0.0078, + "num_tokens": 16397297.0, + "reward": 13.707642555236816, + "reward_std": 0.6473450064659119, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.8495962619781494, + "rewards/length2tails_reward/std": 0.16216562688350677, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.96875, + "completions/mean_terminated_length": 273.96875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0832118820399046, + "epoch": 3.758, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10160697251558304, + "learning_rate": 3.0155922277809256e-07, + "loss": 0.0002, + "num_tokens": 16406096.0, + "reward": 13.88062858581543, + "reward_std": 0.37648022174835205, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8356945514678955, + "rewards/length2tails_reward/std": 0.2622833847999573, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 273.46875, + "completions/mean_terminated_length": 273.46875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.09201773721724749, + "epoch": 3.76, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10607636719942093, + "learning_rate": 3.006421087122538e-07, + "loss": 0.0027, + "num_tokens": 16414879.0, + "reward": 13.839288711547852, + "reward_std": 0.4233863949775696, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.821160078048706, + "rewards/length2tails_reward/std": 0.23787914216518402, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.90625, + "completions/mean_terminated_length": 269.90625, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.08618500549346209, + "epoch": 3.762, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11446670442819595, + "learning_rate": 2.9972614456474533e-07, + "loss": -0.0015, + "num_tokens": 16423548.0, + "reward": 13.815107345581055, + "reward_std": 0.9864696264266968, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7166304588317871, + "rewards/length2tails_reward/std": 0.2847166359424591, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6557796001434326, + "rewards/thermo_reward/std": 0.9801791906356812, + "step": 1881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.9375, + "completions/mean_terminated_length": 273.9375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08535457588732243, + "epoch": 3.7640000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1087120845913887, + "learning_rate": 2.9881133184163944e-07, + "loss": 0.0012, + "num_tokens": 16432346.0, + "reward": 13.84271240234375, + "reward_std": 0.42505595088005066, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8553938865661621, + "rewards/length2tails_reward/std": 0.20128926634788513, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 273.28125, + "completions/mean_terminated_length": 273.28125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08107516821473837, + "epoch": 3.766, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0713000297546387, + "learning_rate": 2.978976720471161e-07, + "loss": 0.0098, + "num_tokens": 16441123.0, + "reward": 13.712474822998047, + "reward_std": 1.392941951751709, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8027397990226746, + "rewards/length2tails_reward/std": 0.23403459787368774, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5718953609466553, + "rewards/thermo_reward/std": 1.2424302101135254, + "step": 1883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08162663597613573, + "epoch": 3.768, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08860152959823608, + "learning_rate": 2.969851666834594e-07, + "loss": -0.0009, + "num_tokens": 16449874.0, + "reward": 13.41884994506836, + "reward_std": 1.3702086210250854, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7931690216064453, + "rewards/length2tails_reward/std": 0.23420800268650055, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.279226779937744, + "rewards/thermo_reward/std": 1.2637813091278076, + "step": 1884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.09294078592211008, + "epoch": 3.77, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45369336009025574, + "learning_rate": 2.9607381725105507e-07, + "loss": -0.0213, + "num_tokens": 16458596.0, + "reward": 13.53420639038086, + "reward_std": 1.4441404342651367, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.8011078834533691, + "rewards/length2tails_reward/std": 0.2844718396663666, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4485087394714355, + "rewards/thermo_reward/std": 1.2097461223602295, + "step": 1885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.53125, + "completions/mean_terminated_length": 272.53125, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.08244166569784284, + "epoch": 3.7720000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12649722397327423, + "learning_rate": 2.9516362524838847e-07, + "loss": -0.0007, + "num_tokens": 16467349.0, + "reward": 13.838593482971191, + "reward_std": 0.43084508180618286, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.814199686050415, + "rewards/length2tails_reward/std": 0.2507236897945404, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 276.78125, + "completions/mean_terminated_length": 276.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09738295339047909, + "epoch": 3.774, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08789598941802979, + "learning_rate": 2.942545921720412e-07, + "loss": -0.0094, + "num_tokens": 16476238.0, + "reward": 13.785650253295898, + "reward_std": 0.8378366827964783, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8598126173019409, + "rewards/length2tails_reward/std": 0.2199796885251999, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.612004280090332, + "rewards/thermo_reward/std": 0.823988139629364, + "step": 1887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.08558739256113768, + "epoch": 3.776, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1280132681131363, + "learning_rate": 2.9334671951668986e-07, + "loss": 0.0029, + "num_tokens": 16484935.0, + "reward": 13.868514060974121, + "reward_std": 0.3720110058784485, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7145437002182007, + "rewards/length2tails_reward/std": 0.2871835231781006, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07987150177359581, + "epoch": 3.778, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1321270912885666, + "learning_rate": 2.924400087751031e-07, + "loss": -0.0145, + "num_tokens": 16493696.0, + "reward": 13.993861198425293, + "reward_std": 0.02611471712589264, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.771440863609314, + "rewards/length2tails_reward/std": 0.2611500322818756, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.829052448272705, + "rewards/thermo_reward/std": 0.0, + "step": 1889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.06972839124500751, + "epoch": 3.7800000000000002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08080914616584778, + "learning_rate": 2.9153446143813886e-07, + "loss": -0.0049, + "num_tokens": 16502432.0, + "reward": 13.62398910522461, + "reward_std": 1.421363115310669, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.5377397537231445, + "rewards/kidney_reward/std": 0.5019885897636414, + "rewards/length2tails_reward/mean": 0.7057427167892456, + "rewards/length2tails_reward/std": 0.3191591203212738, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.07965558161959052, + "epoch": 3.782, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20845657587051392, + "learning_rate": 2.906300789947421e-07, + "loss": 0.0013, + "num_tokens": 16511202.0, + "reward": 13.137361526489258, + "reward_std": 2.915266752243042, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.452592670917511, + "rewards/kidney_reward/mean": 2.374690055847168, + "rewards/kidney_reward/std": 0.9955797791481018, + "rewards/length2tails_reward/mean": 0.811353325843811, + "rewards/length2tails_reward/std": 0.2351706475019455, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3353686332702637, + "rewards/thermo_reward/std": 1.490159034729004, + "step": 1891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.75, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.0726462290622294, + "epoch": 3.784, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10030462592840195, + "learning_rate": 2.8972686293194306e-07, + "loss": -0.0036, + "num_tokens": 16519898.0, + "reward": 13.725658416748047, + "reward_std": 1.0770522356033325, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7097656726837158, + "rewards/length2tails_reward/std": 0.3195935785770416, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08478438295423985, + "epoch": 3.786, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08734481781721115, + "learning_rate": 2.8882481473485276e-07, + "loss": -0.0004, + "num_tokens": 16528636.0, + "reward": 13.870732307434082, + "reward_std": 0.38143596053123474, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7367347478866577, + "rewards/length2tails_reward/std": 0.2800016701221466, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.08649180363863707, + "epoch": 3.7880000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06724435091018677, + "learning_rate": 2.879239358866632e-07, + "loss": -0.0037, + "num_tokens": 16537326.0, + "reward": 13.382439613342285, + "reward_std": 3.0069379806518555, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.5390896797180176, + "rewards/kidney_reward/std": 0.49435171484947205, + "rewards/length2tails_reward/mean": 0.74407958984375, + "rewards/length2tails_reward/std": 0.2597999572753906, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6158881187438965, + "rewards/thermo_reward/std": 0.8037141561508179, + "step": 1894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 286.8125, + "completions/mean_terminated_length": 271.7419128417969, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08883518259972334, + "epoch": 3.79, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33261749148368835, + "learning_rate": 2.870242278686432e-07, + "loss": -0.0576, + "num_tokens": 16546536.0, + "reward": 13.99350357055664, + "reward_std": 0.02659435383975506, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7678635120391846, + "rewards/length2tails_reward/std": 0.2659464478492737, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.829052448272705, + "rewards/thermo_reward/std": 0.0, + "step": 1895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.78125, + "completions/mean_terminated_length": 271.78125, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.09447172377258539, + "epoch": 3.792, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25320422649383545, + "learning_rate": 2.861256921601367e-07, + "loss": 0.0054, + "num_tokens": 16555265.0, + "reward": 13.790377616882324, + "reward_std": 0.943525493144989, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7760162949562073, + "rewards/length2tails_reward/std": 0.2553281784057617, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.652470111846924, + "rewards/thermo_reward/std": 0.7984983921051025, + "step": 1896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07667552959173918, + "epoch": 3.794, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09639758616685867, + "learning_rate": 2.852283302385602e-07, + "loss": -0.0021, + "num_tokens": 16564015.0, + "reward": 13.875651359558105, + "reward_std": 0.3780142664909363, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7859266996383667, + "rewards/length2tails_reward/std": 0.2399982511997223, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.40625, + "completions/mean_terminated_length": 272.40625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.07352221850305796, + "epoch": 3.7960000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08981627225875854, + "learning_rate": 2.8433214357939917e-07, + "loss": -0.0082, + "num_tokens": 16572764.0, + "reward": 13.6255521774292, + "reward_std": 1.1251661777496338, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7463036775588989, + "rewards/length2tails_reward/std": 0.310706228017807, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5179758071899414, + "rewards/thermo_reward/std": 0.9320968985557556, + "step": 1898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08282679691910744, + "epoch": 3.798, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08296433836221695, + "learning_rate": 2.834371336562077e-07, + "loss": -0.0004, + "num_tokens": 16581519.0, + "reward": 13.956785202026367, + "reward_std": 0.22445662319660187, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7995490431785583, + "rewards/length2tails_reward/std": 0.24117223918437958, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.28125, + "completions/mean_terminated_length": 270.28125, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.07467029197141528, + "epoch": 3.8, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07870402932167053, + "learning_rate": 2.8254330194060515e-07, + "loss": -0.0, + "num_tokens": 16590200.0, + "reward": 13.618962287902832, + "reward_std": 1.8877650499343872, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.526204824447632, + "rewards/kidney_reward/std": 0.5672398209571838, + "rewards/length2tails_reward/mean": 0.7450594902038574, + "rewards/length2tails_reward/std": 0.2946070730686188, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6145756244659424, + "rewards/thermo_reward/std": 1.0059599876403809, + "step": 1900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08942047785967588, + "epoch": 3.802, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9487526416778564, + "learning_rate": 2.816506499022725e-07, + "loss": 0.0028, + "num_tokens": 16598939.0, + "reward": 13.454952239990234, + "reward_std": 2.020958423614502, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.506537437438965, + "rewards/kidney_reward/std": 0.5413090586662292, + "rewards/length2tails_reward/mean": 0.7707405090332031, + "rewards/length2tails_reward/std": 0.22672009468078613, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.467665195465088, + "rewards/thermo_reward/std": 1.1921191215515137, + "step": 1901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08338914159685373, + "epoch": 3.8040000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08682217448949814, + "learning_rate": 2.807591790089521e-07, + "loss": -0.0004, + "num_tokens": 16607682.0, + "reward": 13.701095581054688, + "reward_std": 1.0283238887786865, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7714388966560364, + "rewards/length2tails_reward/std": 0.23634468019008636, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.536287307739258, + "rewards/thermo_reward/std": 1.029089331626892, + "step": 1902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.15625, + "completions/mean_terminated_length": 273.15625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0814851438626647, + "epoch": 3.806, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10658857971429825, + "learning_rate": 2.79868890726444e-07, + "loss": 0.0046, + "num_tokens": 16616455.0, + "reward": 13.840827941894531, + "reward_std": 0.42394959926605225, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8365510702133179, + "rewards/length2tails_reward/std": 0.21344824135303497, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.46875, + "completions/mean_terminated_length": 273.46875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.083722242154181, + "epoch": 3.808, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07940282672643661, + "learning_rate": 2.789797865186032e-07, + "loss": -0.0018, + "num_tokens": 16625238.0, + "reward": 13.84145736694336, + "reward_std": 0.4304184019565582, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8428469896316528, + "rewards/length2tails_reward/std": 0.17144176363945007, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.71875, + "completions/mean_terminated_length": 272.71875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08519653510302305, + "epoch": 3.81, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21932841837406158, + "learning_rate": 2.7809186784733827e-07, + "loss": 0.0007, + "num_tokens": 16633997.0, + "reward": 13.91900634765625, + "reward_std": 0.3112717568874359, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8206124305725098, + "rewards/length2tails_reward/std": 0.23140211403369904, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08144298056140542, + "epoch": 3.8120000000000003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06131415069103241, + "learning_rate": 2.7720513617260855e-07, + "loss": -0.0029, + "num_tokens": 16642721.0, + "reward": 13.456586837768555, + "reward_std": 1.9371612071990967, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.533867359161377, + "rewards/kidney_reward/std": 0.523894190788269, + "rewards/length2tails_reward/mean": 0.7181217670440674, + "rewards/length2tails_reward/std": 0.2754988372325897, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3897223472595215, + "rewards/thermo_reward/std": 1.4708243608474731, + "step": 1906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0787704661488533, + "epoch": 3.814, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08822289109230042, + "learning_rate": 2.7631959295242124e-07, + "loss": -0.002, + "num_tokens": 16651484.0, + "reward": 13.826042175292969, + "reward_std": 0.9932739734649658, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8458192348480225, + "rewards/length2tails_reward/std": 0.16225025057792664, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6811540126800537, + "rewards/thermo_reward/std": 0.836639404296875, + "step": 1907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0801471695303917, + "epoch": 3.816, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3141650855541229, + "learning_rate": 2.754352396428302e-07, + "loss": -0.0033, + "num_tokens": 16660220.0, + "reward": 13.208551406860352, + "reward_std": 4.000711441040039, + "rewards/fitness_reward/mean": 7.050926208496094, + "rewards/fitness_reward/std": 1.7550898790359497, + "rewards/kidney_reward/mean": 2.4628095626831055, + "rewards/kidney_reward/std": 0.9258583188056946, + "rewards/length2tails_reward/mean": 0.7736960053443909, + "rewards/length2tails_reward/std": 0.26007622480392456, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5174460411071777, + "rewards/thermo_reward/std": 1.3342645168304443, + "step": 1908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07614208897575736, + "epoch": 3.818, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14567843079566956, + "learning_rate": 2.7455207769793153e-07, + "loss": 0.0002, + "num_tokens": 16668974.0, + "reward": 13.886138916015625, + "reward_std": 0.4331798255443573, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7655285596847534, + "rewards/length2tails_reward/std": 0.2807142436504364, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08579348865896463, + "epoch": 3.82, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08240564912557602, + "learning_rate": 2.736701085698635e-07, + "loss": 0.0007, + "num_tokens": 16677731.0, + "reward": 13.637163162231445, + "reward_std": 1.3797341585159302, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7908780574798584, + "rewards/length2tails_reward/std": 0.271894633769989, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.497770071029663, + "rewards/thermo_reward/std": 1.2343865633010864, + "step": 1910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08684431295841932, + "epoch": 3.822, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07356935739517212, + "learning_rate": 2.7278933370880263e-07, + "loss": -0.0037, + "num_tokens": 16686501.0, + "reward": 13.682441711425781, + "reward_std": 1.0081303119659424, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8031569719314575, + "rewards/length2tails_reward/std": 0.2724221348762512, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.541821002960205, + "rewards/thermo_reward/std": 1.0000317096710205, + "step": 1911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08624691423028708, + "epoch": 3.824, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10585896670818329, + "learning_rate": 2.7190975456296193e-07, + "loss": 0.0025, + "num_tokens": 16695246.0, + "reward": 13.877214431762695, + "reward_std": 0.37398141622543335, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8015420436859131, + "rewards/length2tails_reward/std": 0.22604189813137054, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.07903222367167473, + "epoch": 3.826, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13978196680545807, + "learning_rate": 2.7103137257858863e-07, + "loss": -0.001, + "num_tokens": 16703917.0, + "reward": 13.721193313598633, + "reward_std": 0.9134910702705383, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7433258295059204, + "rewards/length2tails_reward/std": 0.2548017203807831, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5591952800750732, + "rewards/thermo_reward/std": 0.909770131111145, + "step": 1913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.09199087880551815, + "epoch": 3.828, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11646510660648346, + "learning_rate": 2.7015418919996057e-07, + "loss": 0.004, + "num_tokens": 16712607.0, + "reward": 13.66675090789795, + "reward_std": 1.4613165855407715, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.528264045715332, + "rewards/kidney_reward/std": 0.5555903911590576, + "rewards/length2tails_reward/mean": 0.8499467372894287, + "rewards/length2tails_reward/std": 0.18801383674144745, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5923073291778564, + "rewards/thermo_reward/std": 0.927997350692749, + "step": 1914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08600977715104818, + "epoch": 3.83, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15362295508384705, + "learning_rate": 2.6927820586938576e-07, + "loss": 0.0015, + "num_tokens": 16721337.0, + "reward": 13.803441047668457, + "reward_std": 0.5228808522224426, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7362752556800842, + "rewards/length2tails_reward/std": 0.2287837564945221, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.40625, + "completions/mean_terminated_length": 273.40625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08416588045656681, + "epoch": 3.832, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09511641412973404, + "learning_rate": 2.684034240271986e-07, + "loss": -0.0012, + "num_tokens": 16730118.0, + "reward": 13.813632011413574, + "reward_std": 0.5235826969146729, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8381756544113159, + "rewards/length2tails_reward/std": 0.23721517622470856, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.07561897998675704, + "epoch": 3.834, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06131261587142944, + "learning_rate": 2.6752984511175814e-07, + "loss": -0.0001, + "num_tokens": 16738883.0, + "reward": 13.876596450805664, + "reward_std": 0.37739771604537964, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7953763008117676, + "rewards/length2tails_reward/std": 0.2556256949901581, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08190339151769876, + "epoch": 3.836, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10995667427778244, + "learning_rate": 2.6665747055944553e-07, + "loss": 0.0029, + "num_tokens": 16747628.0, + "reward": 13.7941255569458, + "reward_std": 0.46538564562797546, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7683846950531006, + "rewards/length2tails_reward/std": 0.2302393913269043, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08025580272078514, + "epoch": 3.838, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0882352814078331, + "learning_rate": 2.657863018046611e-07, + "loss": -0.0039, + "num_tokens": 16756378.0, + "reward": 13.775444030761719, + "reward_std": 0.8556699156761169, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8053842782974243, + "rewards/length2tails_reward/std": 0.24007809162139893, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.60724139213562, + "rewards/thermo_reward/std": 0.8489658236503601, + "step": 1919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0824215766042471, + "epoch": 3.84, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15367527306079865, + "learning_rate": 2.6491634027982324e-07, + "loss": 0.0017, + "num_tokens": 16765143.0, + "reward": 13.848047256469727, + "reward_std": 0.4790394902229309, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7834751009941101, + "rewards/length2tails_reward/std": 0.2990477383136749, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.96875, + "completions/mean_terminated_length": 272.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08515264093875885, + "epoch": 3.842, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0763060674071312, + "learning_rate": 2.64047587415365e-07, + "loss": 0.0018, + "num_tokens": 16773910.0, + "reward": 13.881507873535156, + "reward_std": 0.37522459030151367, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8444912433624268, + "rewards/length2tails_reward/std": 0.1953170895576477, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 283.1875, + "completions/mean_terminated_length": 283.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0946641843765974, + "epoch": 3.844, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3928823173046112, + "learning_rate": 2.6318004463973233e-07, + "loss": -0.0102, + "num_tokens": 16783004.0, + "reward": 13.092012405395508, + "reward_std": 4.512775421142578, + "rewards/fitness_reward/mean": 6.999444007873535, + "rewards/fitness_reward/std": 2.046316623687744, + "rewards/kidney_reward/mean": 2.412200927734375, + "rewards/kidney_reward/std": 1.0636920928955078, + "rewards/length2tails_reward/mean": 0.7764885425567627, + "rewards/length2tails_reward/std": 0.2910473942756653, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.502717971801758, + "rewards/thermo_reward/std": 1.4153804779052734, + "step": 1922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.08065614709630609, + "epoch": 3.846, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08240630477666855, + "learning_rate": 2.6231371337938144e-07, + "loss": -0.0057, + "num_tokens": 16791707.0, + "reward": 13.549034118652344, + "reward_std": 1.318332552909851, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7552239894866943, + "rewards/length2tails_reward/std": 0.3136855959892273, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3858466148376465, + "rewards/thermo_reward/std": 1.3002216815948486, + "step": 1923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08426021132618189, + "epoch": 3.848, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12145797163248062, + "learning_rate": 2.6144859505877603e-07, + "loss": -0.0031, + "num_tokens": 16800450.0, + "reward": 13.404792785644531, + "reward_std": 2.175577163696289, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.510470390319824, + "rewards/kidney_reward/std": 0.6562471985816956, + "rewards/length2tails_reward/mean": 0.7557017803192139, + "rewards/length2tails_reward/std": 0.27980005741119385, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4150757789611816, + "rewards/thermo_reward/std": 1.337871789932251, + "step": 1924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 276.5, + "completions/mean_terminated_length": 276.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.10557070840150118, + "epoch": 3.85, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2935408651828766, + "learning_rate": 2.6058469110038626e-07, + "loss": -0.0007, + "num_tokens": 16809330.0, + "reward": 13.773045539855957, + "reward_std": 0.5495082139968872, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8311747312545776, + "rewards/length2tails_reward/std": 0.23529881238937378, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08785425685346127, + "epoch": 3.852, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11054041981697083, + "learning_rate": 2.597220029246846e-07, + "loss": -0.0045, + "num_tokens": 16818102.0, + "reward": 13.029808044433594, + "reward_std": 2.00207781791687, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5444021224975586, + "rewards/kidney_reward/std": 0.2592725455760956, + "rewards/length2tails_reward/mean": 0.8216935396194458, + "rewards/length2tails_reward/std": 0.22593268752098083, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 2.942051410675049, + "rewards/thermo_reward/std": 1.8460910320281982, + "step": 1926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07975840009748936, + "epoch": 3.854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12637239694595337, + "learning_rate": 2.5886053195014534e-07, + "loss": -0.0028, + "num_tokens": 16826834.0, + "reward": 13.594480514526367, + "reward_std": 1.4269717931747437, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7686116695404053, + "rewards/length2tails_reward/std": 0.2542043924331665, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4573142528533936, + "rewards/thermo_reward/std": 1.3170541524887085, + "step": 1927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.07655230350792408, + "epoch": 3.856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17291785776615143, + "learning_rate": 2.5800027959324087e-07, + "loss": 0.0039, + "num_tokens": 16835599.0, + "reward": 13.932228088378906, + "reward_std": 0.3779231905937195, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8275623321533203, + "rewards/length2tails_reward/std": 0.2156936079263687, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.21875, + "completions/mean_terminated_length": 273.21875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08698383998125792, + "epoch": 3.858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10282512754201889, + "learning_rate": 2.571412472684401e-07, + "loss": 0.0015, + "num_tokens": 16844374.0, + "reward": 13.840457916259766, + "reward_std": 0.43196240067481995, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8328460454940796, + "rewards/length2tails_reward/std": 0.19952915608882904, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.0, + "completions/mean_terminated_length": 273.0, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.07852550689131021, + "epoch": 3.86, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08711478114128113, + "learning_rate": 2.5628343638820625e-07, + "loss": -0.0042, + "num_tokens": 16853142.0, + "reward": 13.91780948638916, + "reward_std": 0.3206358551979065, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8086421489715576, + "rewards/length2tails_reward/std": 0.24440136551856995, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08650944288820028, + "epoch": 3.862, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13395126163959503, + "learning_rate": 2.554268483629931e-07, + "loss": -0.0035, + "num_tokens": 16861898.0, + "reward": 13.796703338623047, + "reward_std": 0.4803380072116852, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7941558361053467, + "rewards/length2tails_reward/std": 0.27170467376708984, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08016407676041126, + "epoch": 3.864, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11705422401428223, + "learning_rate": 2.5457148460124476e-07, + "loss": -0.0011, + "num_tokens": 16870611.0, + "reward": 13.907598495483398, + "reward_std": 0.31562539935112, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7065348625183105, + "rewards/length2tails_reward/std": 0.28879106044769287, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08776333183050156, + "epoch": 3.866, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07915785908699036, + "learning_rate": 2.5371734650939204e-07, + "loss": -0.0035, + "num_tokens": 16879332.0, + "reward": 13.870550155639648, + "reward_std": 0.38491547107696533, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7349153757095337, + "rewards/length2tails_reward/std": 0.2707245945930481, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.625, + "completions/mean_terminated_length": 273.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07789402874186635, + "epoch": 3.868, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1011756882071495, + "learning_rate": 2.528644354918503e-07, + "loss": 0.0028, + "num_tokens": 16888120.0, + "reward": 13.960830688476562, + "reward_std": 0.22377075254917145, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8399984240531921, + "rewards/length2tails_reward/std": 0.21737302839756012, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08203192008659244, + "epoch": 3.87, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08774164319038391, + "learning_rate": 2.5201275295101775e-07, + "loss": 0.0019, + "num_tokens": 16896853.0, + "reward": 13.91592788696289, + "reward_std": 0.3097189962863922, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7898266315460205, + "rewards/length2tails_reward/std": 0.19600234925746918, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08788663055747747, + "epoch": 3.872, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10273593664169312, + "learning_rate": 2.511623002872718e-07, + "loss": 0.0005, + "num_tokens": 16905592.0, + "reward": 13.651347160339355, + "reward_std": 1.1073802709579468, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.803066611289978, + "rewards/length2tails_reward/std": 0.2153354436159134, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.510735511779785, + "rewards/thermo_reward/std": 0.968666672706604, + "step": 1936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.875, + "completions/mean_terminated_length": 270.875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.08587850071489811, + "epoch": 3.874, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15995830297470093, + "learning_rate": 2.5031307889896847e-07, + "loss": 0.0002, + "num_tokens": 16914292.0, + "reward": 13.452787399291992, + "reward_std": 1.6182825565338135, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7807340025901794, + "rewards/length2tails_reward/std": 0.25488072633743286, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3992772102355957, + "rewards/thermo_reward/std": 1.2482622861862183, + "step": 1937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.08146081399172544, + "epoch": 3.876, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11728298664093018, + "learning_rate": 2.494650901824389e-07, + "loss": -0.0016, + "num_tokens": 16923001.0, + "reward": 13.694450378417969, + "reward_std": 1.0769379138946533, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.789922833442688, + "rewards/length2tails_reward/std": 0.3011004328727722, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.555152177810669, + "rewards/thermo_reward/std": 0.9306294918060303, + "step": 1938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.08420975971966982, + "epoch": 3.878, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20011083781719208, + "learning_rate": 2.486183355319875e-07, + "loss": -0.0079, + "num_tokens": 16931742.0, + "reward": 13.739608764648438, + "reward_std": 0.861034631729126, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8160064220428467, + "rewards/length2tails_reward/std": 0.20948392152786255, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.570343494415283, + "rewards/thermo_reward/std": 0.852788507938385, + "step": 1939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 273.375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.08624033536761999, + "epoch": 3.88, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10202612727880478, + "learning_rate": 2.4777281633988976e-07, + "loss": 0.0011, + "num_tokens": 16940522.0, + "reward": 13.691545486450195, + "reward_std": 0.9300704002380371, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8535068035125732, + "rewards/length2tails_reward/std": 0.1830395758152008, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5185298919677734, + "rewards/thermo_reward/std": 0.929313600063324, + "step": 1940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 273.625, + "completions/mean_terminated_length": 273.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07235424825921655, + "epoch": 3.882, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10096503794193268, + "learning_rate": 2.4692853399638913e-07, + "loss": -0.0021, + "num_tokens": 16949310.0, + "reward": 13.85297966003418, + "reward_std": 0.48421013355255127, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8328012228012085, + "rewards/length2tails_reward/std": 0.23313400149345398, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08609567582607269, + "epoch": 3.884, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06079360470175743, + "learning_rate": 2.4608548988969593e-07, + "loss": -0.0058, + "num_tokens": 16958061.0, + "reward": 13.425346374511719, + "reward_std": 3.0079617500305176, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.5390896797180176, + "rewards/kidney_reward/std": 0.49435171484947205, + "rewards/length2tails_reward/mean": 0.7742855548858643, + "rewards/length2tails_reward/std": 0.2837405800819397, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6557741165161133, + "rewards/thermo_reward/std": 0.7805873155593872, + "step": 1942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.07651135697960854, + "epoch": 3.886, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08636076003313065, + "learning_rate": 2.452436854059843e-07, + "loss": -0.0013, + "num_tokens": 16966811.0, + "reward": 13.842947006225586, + "reward_std": 0.43455979228019714, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8577354550361633, + "rewards/length2tails_reward/std": 0.1791587769985199, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08422570209950209, + "epoch": 3.888, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15154993534088135, + "learning_rate": 2.4440312192939037e-07, + "loss": -0.0, + "num_tokens": 16975572.0, + "reward": 12.78995418548584, + "reward_std": 3.594736337661743, + "rewards/fitness_reward/mean": 7.188657760620117, + "rewards/fitness_reward/std": 0.7179933190345764, + "rewards/kidney_reward/mean": 2.3057146072387695, + "rewards/kidney_reward/std": 1.1698397397994995, + "rewards/length2tails_reward/mean": 0.7942888140678406, + "rewards/length2tails_reward/std": 0.27767103910446167, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1161532402038574, + "rewards/thermo_reward/std": 1.8846759796142578, + "step": 1944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.07934307120740414, + "epoch": 3.89, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16269008815288544, + "learning_rate": 2.435638008420098e-07, + "loss": 0.0047, + "num_tokens": 16984276.0, + "reward": 13.656649589538574, + "reward_std": 1.0255653858184814, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7091237902641296, + "rewards/length2tails_reward/std": 0.2751384675502777, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5254311561584473, + "rewards/thermo_reward/std": 0.8948387503623962, + "step": 1945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.0, + "completions/max_terminated_length": 733.0, + "completions/mean_length": 287.40625, + "completions/mean_terminated_length": 287.40625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09849395509809256, + "epoch": 3.892, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30336740612983704, + "learning_rate": 2.4272572352389485e-07, + "loss": -0.0189, + "num_tokens": 16993505.0, + "reward": 13.672941207885742, + "reward_std": 1.2645598649978638, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7417504787445068, + "rewards/length2tails_reward/std": 0.3114211857318878, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5384607315063477, + "rewards/thermo_reward/std": 1.2190366983413696, + "step": 1946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0812322972342372, + "epoch": 3.894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11934258043766022, + "learning_rate": 2.41888891353053e-07, + "loss": -0.0014, + "num_tokens": 17002245.0, + "reward": 13.91276741027832, + "reward_std": 0.31673482060432434, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7582290172576904, + "rewards/length2tails_reward/std": 0.23247385025024414, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08449868764728308, + "epoch": 3.896, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11675684154033661, + "learning_rate": 2.410533057054446e-07, + "loss": -0.0017, + "num_tokens": 17011015.0, + "reward": 13.878592491149902, + "reward_std": 0.37778231501579285, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8153381943702698, + "rewards/length2tails_reward/std": 0.26463058590888977, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.0786692202091217, + "epoch": 3.898, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12710154056549072, + "learning_rate": 2.4021896795498044e-07, + "loss": 0.004, + "num_tokens": 17019724.0, + "reward": 13.421014785766602, + "reward_std": 1.9942625761032104, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.4906506538391113, + "rewards/kidney_reward/std": 0.6279568672180176, + "rewards/length2tails_reward/mean": 0.7816104888916016, + "rewards/length2tails_reward/std": 0.2606870234012604, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4485268592834473, + "rewards/thermo_reward/std": 1.0938901901245117, + "step": 1949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.08455433277413249, + "epoch": 3.9, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12284710258245468, + "learning_rate": 2.3938587947351917e-07, + "loss": 0.004, + "num_tokens": 17028475.0, + "reward": 13.583101272583008, + "reward_std": 1.096427083015442, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8623867034912109, + "rewards/length2tails_reward/std": 0.21832647919654846, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.436556816101074, + "rewards/thermo_reward/std": 0.9650663733482361, + "step": 1950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.0, + "completions/mean_terminated_length": 273.0, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.090028902515769, + "epoch": 3.902, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09488416463136673, + "learning_rate": 2.3855404163086556e-07, + "loss": -0.0075, + "num_tokens": 17037243.0, + "reward": 13.706623077392578, + "reward_std": 1.1716996431350708, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8330482244491577, + "rewards/length2tails_reward/std": 0.2136765569448471, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5356531143188477, + "rewards/thermo_reward/std": 1.1550958156585693, + "step": 1951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 273.71875, + "completions/mean_terminated_length": 273.71875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0850351257249713, + "epoch": 3.904, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14141486585140228, + "learning_rate": 2.3772345579476816e-07, + "loss": -0.0007, + "num_tokens": 17046034.0, + "reward": 13.774679183959961, + "reward_std": 0.5537977814674377, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8475044369697571, + "rewards/length2tails_reward/std": 0.1927335411310196, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.07830340228974819, + "epoch": 3.906, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0867738425731659, + "learning_rate": 2.3689412333091618e-07, + "loss": 0.0036, + "num_tokens": 17054762.0, + "reward": 13.950662612915039, + "reward_std": 0.22277358174324036, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7383153438568115, + "rewards/length2tails_reward/std": 0.2902863621711731, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07980364747345448, + "epoch": 3.908, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13662275671958923, + "learning_rate": 2.3606604560293875e-07, + "loss": 0.0002, + "num_tokens": 17063483.0, + "reward": 13.750734329223633, + "reward_std": 0.9528560638427734, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7522097826004028, + "rewards/length2tails_reward/std": 0.2517887353897095, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6152076721191406, + "rewards/thermo_reward/std": 0.807259738445282, + "step": 1954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.08249637857079506, + "epoch": 3.91, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09899752587080002, + "learning_rate": 2.352392239724016e-07, + "loss": 0.0014, + "num_tokens": 17072192.0, + "reward": 13.777750015258789, + "reward_std": 0.8359129428863525, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8064588308334351, + "rewards/length2tails_reward/std": 0.2694641649723053, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6094391345977783, + "rewards/thermo_reward/std": 0.8374254107475281, + "step": 1955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.09375, + "completions/mean_terminated_length": 273.09375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.07944468408823013, + "epoch": 3.912, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1025310754776001, + "learning_rate": 2.3441365979880522e-07, + "loss": 0.002, + "num_tokens": 17080963.0, + "reward": 13.541351318359375, + "reward_std": 1.7019767761230469, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5238256454467773, + "rewards/kidney_reward/std": 0.5806989669799805, + "rewards/length2tails_reward/mean": 0.8096390962600708, + "rewards/length2tails_reward/std": 0.2314687967300415, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4753763675689697, + "rewards/thermo_reward/std": 1.15151047706604, + "step": 1956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08525500353425741, + "epoch": 3.914, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13864275813102722, + "learning_rate": 2.335893544395826e-07, + "loss": -0.0004, + "num_tokens": 17089687.0, + "reward": 13.91266918182373, + "reward_std": 0.31485801935195923, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7572414875030518, + "rewards/length2tails_reward/std": 0.22235172986984253, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.08531990181654692, + "epoch": 3.916, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08629897981882095, + "learning_rate": 2.3276630925009632e-07, + "loss": 0.0032, + "num_tokens": 17098402.0, + "reward": 13.874603271484375, + "reward_std": 0.37391039729118347, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7754376530647278, + "rewards/length2tails_reward/std": 0.27152007818222046, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.07741576712578535, + "epoch": 3.918, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11358784884214401, + "learning_rate": 2.3194452558363776e-07, + "loss": 0.0015, + "num_tokens": 17107137.0, + "reward": 13.614516258239746, + "reward_std": 1.0182923078536987, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8247540593147278, + "rewards/length2tails_reward/std": 0.23363043367862701, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4717354774475098, + "rewards/thermo_reward/std": 0.9769327640533447, + "step": 1959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.9375, + "completions/mean_terminated_length": 273.9375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08296280819922686, + "epoch": 3.92, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0738103836774826, + "learning_rate": 2.3112400479142347e-07, + "loss": -0.0021, + "num_tokens": 17115935.0, + "reward": 13.73696231842041, + "reward_std": 0.8988210558891296, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8741310834884644, + "rewards/length2tails_reward/std": 0.19006100296974182, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5618844032287598, + "rewards/thermo_reward/std": 0.8959512710571289, + "step": 1960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 274.09375, + "completions/mean_terminated_length": 274.09375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.08895572554320097, + "epoch": 3.922, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16806237399578094, + "learning_rate": 2.3030474822259394e-07, + "loss": -0.0007, + "num_tokens": 17124738.0, + "reward": 13.75033950805664, + "reward_std": 1.0203158855438232, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.877525806427002, + "rewards/length2tails_reward/std": 0.18097041547298431, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6022818088531494, + "rewards/thermo_reward/std": 0.8750991225242615, + "step": 1961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08735064696520567, + "epoch": 3.924, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0669313445687294, + "learning_rate": 2.2948675722421085e-07, + "loss": 0.0011, + "num_tokens": 17133478.0, + "reward": 13.766349792480469, + "reward_std": 0.8892486095428467, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7945989966392517, + "rewards/length2tails_reward/std": 0.2241227924823761, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5992252826690674, + "rewards/thermo_reward/std": 0.8912632465362549, + "step": 1962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.53125, + "completions/mean_terminated_length": 272.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08859865088015795, + "epoch": 3.926, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09937973320484161, + "learning_rate": 2.2867003314125443e-07, + "loss": -0.0028, + "num_tokens": 17142231.0, + "reward": 13.879058837890625, + "reward_std": 0.384976863861084, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.820002555847168, + "rewards/length2tails_reward/std": 0.21553631126880646, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.71875, + "completions/mean_terminated_length": 273.71875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08682218100875616, + "epoch": 3.928, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08087195456027985, + "learning_rate": 2.278545773166225e-07, + "loss": -0.0032, + "num_tokens": 17151022.0, + "reward": 13.221725463867188, + "reward_std": 3.055856227874756, + "rewards/fitness_reward/mean": 6.987466335296631, + "rewards/fitness_reward/std": 2.11407208442688, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8249701857566833, + "rewards/length2tails_reward/std": 0.2575210630893707, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4526407718658447, + "rewards/thermo_reward/std": 0.8887622356414795, + "step": 1964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.10153763461858034, + "epoch": 3.93, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1791462004184723, + "learning_rate": 2.2704039109112716e-07, + "loss": 0.0009, + "num_tokens": 17159757.0, + "reward": 13.688663482666016, + "reward_std": 1.031781554222107, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.854976236820221, + "rewards/length2tails_reward/std": 0.18852630257606506, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5702195167541504, + "rewards/thermo_reward/std": 0.8534172773361206, + "step": 1965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08356020133942366, + "epoch": 3.932, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09256550669670105, + "learning_rate": 2.262274758034931e-07, + "loss": 0.0042, + "num_tokens": 17168515.0, + "reward": 13.879671096801758, + "reward_std": 0.3743593692779541, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8261242508888245, + "rewards/length2tails_reward/std": 0.181783989071846, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.96875, + "completions/mean_terminated_length": 272.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08657645247876644, + "epoch": 3.934, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11939027160406113, + "learning_rate": 2.254158327903557e-07, + "loss": 0.002, + "num_tokens": 17177282.0, + "reward": 13.892132759094238, + "reward_std": 0.43261972069740295, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8254703283309937, + "rewards/length2tails_reward/std": 0.2279188632965088, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 0.08298607263714075, + "epoch": 3.936, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08153997361660004, + "learning_rate": 2.246054633862575e-07, + "loss": -0.0006, + "num_tokens": 17186008.0, + "reward": 13.959012031555176, + "reward_std": 0.22808024287223816, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.821814775466919, + "rewards/length2tails_reward/std": 0.22938142716884613, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "entropy": 0.08575815940275788, + "epoch": 3.9379999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11718124151229858, + "learning_rate": 2.2379636892364717e-07, + "loss": 0.0043, + "num_tokens": 17194709.0, + "reward": 13.644588470458984, + "reward_std": 1.0277619361877441, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.5717613697052, + "rewards/kidney_reward/std": 0.2153141349554062, + "rewards/length2tails_reward/mean": 0.7811826467514038, + "rewards/length2tails_reward/std": 0.28118860721588135, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.53352427482605, + "rewards/thermo_reward/std": 0.8549103736877441, + "step": 1969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 276.34375, + "completions/mean_terminated_length": 276.34375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08746970538049936, + "epoch": 3.94, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25457999110221863, + "learning_rate": 2.229885507328776e-07, + "loss": -0.0016, + "num_tokens": 17203584.0, + "reward": 13.730911254882812, + "reward_std": 0.5785952210426331, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.8086965084075928, + "rewards/length2tails_reward/std": 0.2609972357749939, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5897364616394043, + "rewards/thermo_reward/std": 0.5061468482017517, + "step": 1970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.71875, + "completions/mean_terminated_length": 272.71875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08229434955865145, + "epoch": 3.942, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11690180748701096, + "learning_rate": 2.2218201014220262e-07, + "loss": 0.0013, + "num_tokens": 17212343.0, + "reward": 13.480477333068848, + "reward_std": 1.8184738159179688, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.498993396759033, + "rewards/kidney_reward/std": 0.7211709022521973, + "rewards/length2tails_reward/mean": 0.8054592609405518, + "rewards/length2tails_reward/std": 0.22931167483329773, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4397528171539307, + "rewards/thermo_reward/std": 1.1389209032058716, + "step": 1971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.07960274396464229, + "epoch": 3.944, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12034840136766434, + "learning_rate": 2.2137674847777576e-07, + "loss": 0.0023, + "num_tokens": 17221099.0, + "reward": 13.918210983276367, + "reward_std": 0.31029683351516724, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8126543164253235, + "rewards/length2tails_reward/std": 0.22591090202331543, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 272.1875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.09035251941531897, + "epoch": 3.9459999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07418270409107208, + "learning_rate": 2.205727670636478e-07, + "loss": -0.0054, + "num_tokens": 17229841.0, + "reward": 13.46790885925293, + "reward_std": 3.006690502166748, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.5390896797180176, + "rewards/kidney_reward/std": 0.49435171484947205, + "rewards/length2tails_reward/mean": 0.8010534048080444, + "rewards/length2tails_reward/std": 0.26960352063179016, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.69566011428833, + "rewards/thermo_reward/std": 0.7545809149742126, + "step": 1973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 274.09375, + "completions/mean_terminated_length": 274.09375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08542555291205645, + "epoch": 3.948, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13188742101192474, + "learning_rate": 2.197700672217635e-07, + "loss": -0.0042, + "num_tokens": 17238644.0, + "reward": 12.878170013427734, + "reward_std": 4.818774700164795, + "rewards/fitness_reward/mean": 7.000814437866211, + "rewards/fitness_reward/std": 2.0385630130767822, + "rewards/kidney_reward/mean": 2.3968443870544434, + "rewards/kidney_reward/std": 1.14970862865448, + "rewards/length2tails_reward/mean": 0.7748308777809143, + "rewards/length2tails_reward/std": 0.26282769441604614, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3030285835266113, + "rewards/thermo_reward/std": 1.8083046674728394, + "step": 1974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08252169284969568, + "epoch": 3.95, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09868843853473663, + "learning_rate": 2.1896865027196143e-07, + "loss": 0.0019, + "num_tokens": 17247394.0, + "reward": 13.794919967651367, + "reward_std": 0.4678279757499695, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7763217687606812, + "rewards/length2tails_reward/std": 0.28034308552742004, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.0822441759519279, + "epoch": 3.952, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07563626766204834, + "learning_rate": 2.181685175319702e-07, + "loss": -0.0039, + "num_tokens": 17256140.0, + "reward": 13.918163299560547, + "reward_std": 0.3168083131313324, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8121821284294128, + "rewards/length2tails_reward/std": 0.2070104479789734, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7492804527282715, + "rewards/thermo_reward/std": 0.3138989210128784, + "step": 1976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.40625, + "completions/mean_terminated_length": 272.40625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.08298068400472403, + "epoch": 3.9539999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0864865854382515, + "learning_rate": 2.1736967031740737e-07, + "loss": -0.001, + "num_tokens": 17264889.0, + "reward": 13.878406524658203, + "reward_std": 0.3799617886543274, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8134766817092896, + "rewards/length2tails_reward/std": 0.18320176005363464, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.08510293252766132, + "epoch": 3.956, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07798423618078232, + "learning_rate": 2.1657210994177643e-07, + "loss": 0.0043, + "num_tokens": 17273634.0, + "reward": 13.881671905517578, + "reward_std": 0.37807029485702515, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8461273312568665, + "rewards/length2tails_reward/std": 0.20430795848369598, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.08048784267157316, + "epoch": 3.958, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10507718473672867, + "learning_rate": 2.1577583771646467e-07, + "loss": 0.0022, + "num_tokens": 17282368.0, + "reward": 13.877695083618164, + "reward_std": 0.3744347095489502, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.806357741355896, + "rewards/length2tails_reward/std": 0.22114646434783936, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.375, + "completions/mean_terminated_length": 269.375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.07727627642452717, + "epoch": 3.96, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0931129902601242, + "learning_rate": 2.1498085495074193e-07, + "loss": -0.0019, + "num_tokens": 17291020.0, + "reward": 13.947714805603027, + "reward_std": 0.23339204490184784, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.708836555480957, + "rewards/length2tails_reward/std": 0.31088170409202576, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.07627571607008576, + "epoch": 3.9619999999999997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19174930453300476, + "learning_rate": 2.1418716295175766e-07, + "loss": 0.0077, + "num_tokens": 17299746.0, + "reward": 13.401413917541504, + "reward_std": 2.5488474369049072, + "rewards/fitness_reward/mean": 7.303675651550293, + "rewards/fitness_reward/std": 0.3253214955329895, + "rewards/kidney_reward/mean": 2.450007915496826, + "rewards/kidney_reward/std": 0.8526635766029358, + "rewards/length2tails_reward/mean": 0.7977835536003113, + "rewards/length2tails_reward/std": 0.23642198741436005, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.4679512977600098, + "rewards/thermo_reward/std": 1.3959442377090454, + "step": 1981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08476059697568417, + "epoch": 3.964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09583441913127899, + "learning_rate": 2.1339476302453873e-07, + "loss": 0.0036, + "num_tokens": 17308468.0, + "reward": 12.910294532775879, + "reward_std": 5.181497573852539, + "rewards/fitness_reward/mean": 6.980969429016113, + "rewards/fitness_reward/std": 2.1508259773254395, + "rewards/kidney_reward/mean": 2.4055721759796143, + "rewards/kidney_reward/std": 1.2496414184570312, + "rewards/length2tails_reward/mean": 0.7198496460914612, + "rewards/length2tails_reward/std": 0.29080212116241455, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3517682552337646, + "rewards/thermo_reward/std": 1.819327712059021, + "step": 1982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08854109048843384, + "epoch": 3.966, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11510281264781952, + "learning_rate": 2.1260365647198797e-07, + "loss": 0.0028, + "num_tokens": 17317219.0, + "reward": 13.87903118133545, + "reward_std": 0.37419331073760986, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8197149038314819, + "rewards/length2tails_reward/std": 0.20691503584384918, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.07455269154161215, + "epoch": 3.968, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20295102894306183, + "learning_rate": 2.118138445948815e-07, + "loss": 0.0036, + "num_tokens": 17325966.0, + "reward": 13.744754791259766, + "reward_std": 0.9908297061920166, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7723274230957031, + "rewards/length2tails_reward/std": 0.27208390831947327, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6072168350219727, + "rewards/thermo_reward/std": 0.8490945100784302, + "step": 1984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 273.59375, + "completions/mean_terminated_length": 273.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08819251973181963, + "epoch": 3.9699999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16574658453464508, + "learning_rate": 2.1102532869186585e-07, + "loss": 0.0052, + "num_tokens": 17334753.0, + "reward": 13.774247169494629, + "reward_std": 0.5482454299926758, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.843185544013977, + "rewards/length2tails_reward/std": 0.22929775714874268, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.629622459411621, + "rewards/thermo_reward/std": 0.4708483815193176, + "step": 1985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.08976717293262482, + "epoch": 3.972, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18116842210292816, + "learning_rate": 2.102381100594577e-07, + "loss": 0.0013, + "num_tokens": 17343477.0, + "reward": 13.875925064086914, + "reward_std": 0.38003987073898315, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7886615991592407, + "rewards/length2tails_reward/std": 0.25267016887664795, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 274.09375, + "completions/mean_terminated_length": 274.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09118715906515718, + "epoch": 3.974, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1178445890545845, + "learning_rate": 2.094521899920403e-07, + "loss": 0.0036, + "num_tokens": 17352280.0, + "reward": 13.685848236083984, + "reward_std": 0.5587000846862793, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8821921944618225, + "rewards/length2tails_reward/std": 0.18764744699001312, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.50996470451355, + "rewards/thermo_reward/std": 0.5615194439888, + "step": 1987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.07772285584360361, + "epoch": 3.976, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08714106678962708, + "learning_rate": 2.0866756978186162e-07, + "loss": -0.0016, + "num_tokens": 17360957.0, + "reward": 13.827531814575195, + "reward_std": 0.4348272383213043, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7035890221595764, + "rewards/length2tails_reward/std": 0.30343014001846313, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.07163404440507293, + "epoch": 3.9779999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09531030058860779, + "learning_rate": 2.078842507190328e-07, + "loss": -0.0063, + "num_tokens": 17369679.0, + "reward": 13.312138557434082, + "reward_std": 2.605790376663208, + "rewards/fitness_reward/mean": 7.246166706085205, + "rewards/fitness_reward/std": 0.650642991065979, + "rewards/kidney_reward/mean": 2.5147488117218018, + "rewards/kidney_reward/std": 0.6320452094078064, + "rewards/length2tails_reward/mean": 0.6933004856109619, + "rewards/length2tails_reward/std": 0.31690841913223267, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.3818931579589844, + "rewards/thermo_reward/std": 1.4718438386917114, + "step": 1989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08093912713229656, + "epoch": 3.98, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06563757359981537, + "learning_rate": 2.0710223409152471e-07, + "loss": -0.0047, + "num_tokens": 17378403.0, + "reward": 13.421937942504883, + "reward_std": 3.006488561630249, + "rewards/fitness_reward/mean": 7.053053855895996, + "rewards/fitness_reward/std": 1.7430548667907715, + "rewards/kidney_reward/mean": 2.5390896797180176, + "rewards/kidney_reward/std": 0.49435171484947205, + "rewards/length2tails_reward/mean": 0.7401957511901855, + "rewards/length2tails_reward/std": 0.3054744601249695, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.6557741165161133, + "rewards/thermo_reward/std": 0.7805873155593872, + "step": 1990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.0830417824909091, + "epoch": 3.982, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08848419785499573, + "learning_rate": 2.0632152118516778e-07, + "loss": -0.0024, + "num_tokens": 17387135.0, + "reward": 13.952494621276855, + "reward_std": 0.23339000344276428, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7566424012184143, + "rewards/length2tails_reward/std": 0.23912560939788818, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.40625, + "completions/mean_terminated_length": 271.40625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.09022182878106833, + "epoch": 3.984, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13608083128929138, + "learning_rate": 2.0554211328364745e-07, + "loss": -0.0047, + "num_tokens": 17395852.0, + "reward": 12.72749137878418, + "reward_std": 4.599298000335693, + "rewards/fitness_reward/mean": 7.022955417633057, + "rewards/fitness_reward/std": 1.9133150577545166, + "rewards/kidney_reward/mean": 2.4153106212615967, + "rewards/kidney_reward/std": 1.0462912321090698, + "rewards/length2tails_reward/mean": 0.784592866897583, + "rewards/length2tails_reward/std": 0.2927846610546112, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.1107654571533203, + "rewards/thermo_reward/std": 2.040642499923706, + "step": 1992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08878266904503107, + "epoch": 3.9859999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0827256366610527, + "learning_rate": 2.0476401166850477e-07, + "loss": -0.0049, + "num_tokens": 17404608.0, + "reward": 13.730037689208984, + "reward_std": 1.076958417892456, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7560330629348755, + "rewards/length2tails_reward/std": 0.28175225853919983, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5941295623779297, + "rewards/thermo_reward/std": 0.918302595615387, + "step": 1993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08085281401872635, + "epoch": 3.988, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09590739011764526, + "learning_rate": 2.0398721761913207e-07, + "loss": 0.0009, + "num_tokens": 17413345.0, + "reward": 13.873292922973633, + "reward_std": 0.3764077425003052, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7623419165611267, + "rewards/length2tails_reward/std": 0.26891589164733887, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7093944549560547, + "rewards/thermo_reward/std": 0.37798434495925903, + "step": 1994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 273.0625, + "completions/mean_terminated_length": 273.0625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.07451770780608058, + "epoch": 3.99, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11704223603010178, + "learning_rate": 2.0321173241277235e-07, + "loss": 0.0013, + "num_tokens": 17422115.0, + "reward": 13.808777809143066, + "reward_std": 0.5187250971794128, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7896385192871094, + "rewards/length2tails_reward/std": 0.24748274683952332, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.07666803011670709, + "epoch": 3.992, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08989939093589783, + "learning_rate": 2.0243755732451562e-07, + "loss": -0.0048, + "num_tokens": 17430856.0, + "reward": 13.739928245544434, + "reward_std": 0.8497186899185181, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.530752658843994, + "rewards/kidney_reward/std": 0.5415143966674805, + "rewards/length2tails_reward/mean": 0.7848218679428101, + "rewards/length2tails_reward/std": 0.27701008319854736, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.669508457183838, + "rewards/thermo_reward/std": 0.42886754870414734, + "step": 1996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.08358711935579777, + "epoch": 3.9939999999999998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08841954916715622, + "learning_rate": 2.0166469362729865e-07, + "loss": 0.001, + "num_tokens": 17439612.0, + "reward": 13.736370086669922, + "reward_std": 1.049423098564148, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.7948298454284668, + "rewards/length2tails_reward/std": 0.2536636292934418, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.5692219734191895, + "rewards/thermo_reward/std": 1.0518332719802856, + "step": 1997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08054515346884727, + "epoch": 3.996, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12520883977413177, + "learning_rate": 2.008931425919015e-07, + "loss": 0.0017, + "num_tokens": 17448320.0, + "reward": 13.648792266845703, + "reward_std": 1.0796316862106323, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7211928367614746, + "rewards/length2tails_reward/std": 0.28733447194099426, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.516367197036743, + "rewards/thermo_reward/std": 0.9401901960372925, + "step": 1998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08685789862647653, + "epoch": 3.998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06330923736095428, + "learning_rate": 2.001229054869461e-07, + "loss": 0.0, + "num_tokens": 17457081.0, + "reward": 13.92857837677002, + "reward_std": 0.37837153673171997, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.599120616912842, + "rewards/kidney_reward/std": 0.15476678311824799, + "rewards/length2tails_reward/mean": 0.7910628318786621, + "rewards/length2tails_reward/std": 0.21263666450977325, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 1999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 273.8125, + "completions/mean_terminated_length": 273.8125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.08658518642187119, + "epoch": 4.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08859028667211533, + "learning_rate": 1.9935398357889389e-07, + "loss": -0.0034, + "num_tokens": 17465875.0, + "reward": 13.957188606262207, + "reward_std": 0.22562836110591888, + "rewards/fitness_reward/mean": 7.361185073852539, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 2.6264796257019043, + "rewards/kidney_reward/std": 0.0, + "rewards/length2tails_reward/mean": 0.8035746216773987, + "rewards/length2tails_reward/std": 0.22424465417861938, + "rewards/repeated_in_batch_reward/mean": 1.0, + "rewards/repeated_in_batch_reward/std": 0.0, + "rewards/thermo_reward/mean": 3.7891664505004883, + "rewards/thermo_reward/std": 0.225629061460495, + "step": 2000 + } + ], + "logging_steps": 1, + "max_steps": 2500, + "num_input_tokens_seen": 17465875, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}