{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 270.0625, "completions/mean_terminated_length": 270.0625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.11410666164010763, "epoch": 0.002, "frac_reward_zero_std": 0.0, "grad_norm": 0.6294371485710144, "learning_rate": 0.0, "loss": 0.0098, "num_tokens": 8674.0, "reward": 1.6712751388549805, "reward_std": 11.362724304199219, "rewards/fitness_reward/mean": 1.533573865890503, "rewards/fitness_reward/std": 5.61301851272583, "rewards/kidney_reward/mean": -0.04256998002529144, "rewards/kidney_reward/std": 2.75425124168396, "rewards/length2tails_reward/mean": 0.601272702217102, "rewards/length2tails_reward/std": 0.4386332929134369, "rewards/repeated_in_batch_reward/mean": 0.78125, "rewards/repeated_in_batch_reward/std": 0.420013427734375, "rewards/thermo_reward/mean": 0.042019158601760864, "rewards/thermo_reward/std": 3.2507805824279785, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.5, "completions/mean_terminated_length": 269.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0807526521384716, "epoch": 0.004, "frac_reward_zero_std": 0.0, "grad_norm": 0.29795771837234497, "learning_rate": 4e-08, "loss": 0.0027, "num_tokens": 17330.0, "reward": 3.9355084896087646, "reward_std": 10.647903442382812, "rewards/fitness_reward/mean": 2.646754741668701, "rewards/fitness_reward/std": 5.466333866119385, "rewards/kidney_reward/mean": 0.5920517444610596, "rewards/kidney_reward/std": 2.4331250190734863, "rewards/length2tails_reward/mean": 0.5398199558258057, "rewards/length2tails_reward/std": 0.40852677822113037, "rewards/repeated_in_batch_reward/mean": 0.84375, "rewards/repeated_in_batch_reward/std": 0.3689020276069641, "rewards/thermo_reward/mean": 0.5583449602127075, "rewards/thermo_reward/std": 3.1016311645507812, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 266.875, "completions/mean_terminated_length": 266.875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.0851337956264615, "epoch": 0.006, "frac_reward_zero_std": 0.0, "grad_norm": 0.43736904859542847, "learning_rate": 8e-08, "loss": -0.0293, "num_tokens": 25902.0, "reward": 1.0216810703277588, "reward_std": 11.25409984588623, "rewards/fitness_reward/mean": 1.7421739101409912, "rewards/fitness_reward/std": 5.732826232910156, "rewards/kidney_reward/mean": -0.23049303889274597, "rewards/kidney_reward/std": 2.7518129348754883, "rewards/length2tails_reward/mean": 0.7120003700256348, "rewards/length2tails_reward/std": 0.3755108118057251, "rewards/repeated_in_batch_reward/mean": 0.90625, "rewards/repeated_in_batch_reward/std": 0.2961445748806, "rewards/thermo_reward/mean": -0.6518245935440063, "rewards/thermo_reward/std": 3.3029723167419434, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 266.9375, "completions/mean_terminated_length": 266.9375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.08211329486221075, "epoch": 0.008, "frac_reward_zero_std": 0.0, "grad_norm": 0.2764379680156708, "learning_rate": 1.2e-07, "loss": -0.0176, "num_tokens": 34476.0, "reward": 3.567150831222534, "reward_std": 10.611852645874023, "rewards/fitness_reward/mean": 2.5539772510528564, "rewards/fitness_reward/std": 5.466435432434082, "rewards/kidney_reward/mean": 0.4766693413257599, "rewards/kidney_reward/std": 2.4557559490203857, "rewards/length2tails_reward/mean": 0.5894806385040283, "rewards/length2tails_reward/std": 0.4003503620624542, "rewards/repeated_in_batch_reward/mean": 0.8125, "rewards/repeated_in_batch_reward/std": 0.3965577781200409, "rewards/thermo_reward/mean": 0.3963066339492798, "rewards/thermo_reward/std": 3.1332244873046875, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.59375, "completions/mean_terminated_length": 269.59375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.07415536837652326, "epoch": 0.01, "frac_reward_zero_std": 0.0, "grad_norm": 0.22633300721645355, "learning_rate": 1.6e-07, "loss": 0.0054, "num_tokens": 43135.0, "reward": -0.5113043785095215, "reward_std": 10.228713035583496, "rewards/fitness_reward/mean": 0.3053843379020691, "rewards/fitness_reward/std": 5.22194766998291, "rewards/kidney_reward/mean": -0.2857877314090729, "rewards/kidney_reward/std": 2.4033970832824707, "rewards/length2tails_reward/mean": 0.5589621067047119, "rewards/length2tails_reward/std": 0.44231438636779785, "rewards/repeated_in_batch_reward/mean": 0.8125, "rewards/repeated_in_batch_reward/std": 0.3965577781200409, "rewards/thermo_reward/mean": -0.6680471897125244, "rewards/thermo_reward/std": 2.894864320755005, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.46875, "completions/mean_terminated_length": 269.46875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08570086862891912, "epoch": 0.012, "frac_reward_zero_std": 0.0, "grad_norm": 0.372472882270813, "learning_rate": 2e-07, "loss": 0.0024, "num_tokens": 51790.0, "reward": 3.6113057136535645, "reward_std": 10.176191329956055, "rewards/fitness_reward/mean": 2.3192968368530273, "rewards/fitness_reward/std": 5.47819709777832, "rewards/kidney_reward/mean": 0.7390017509460449, "rewards/kidney_reward/std": 2.422557830810547, "rewards/length2tails_reward/mean": 0.5406535863876343, "rewards/length2tails_reward/std": 0.4137319326400757, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.39894187450408936, "rewards/thermo_reward/std": 3.1979048252105713, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 268.59375, "completions/mean_terminated_length": 268.59375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.07309244154021144, "epoch": 0.014, "frac_reward_zero_std": 0.0, "grad_norm": 0.4794768989086151, "learning_rate": 2.4e-07, "loss": -0.002, "num_tokens": 60417.0, "reward": 4.083155632019043, "reward_std": 9.34925365447998, "rewards/fitness_reward/mean": 2.4409427642822266, "rewards/fitness_reward/std": 5.35188627243042, "rewards/kidney_reward/mean": 0.8888282775878906, "rewards/kidney_reward/std": 1.9836840629577637, "rewards/length2tails_reward/mean": 0.4503927230834961, "rewards/length2tails_reward/std": 0.430117130279541, "rewards/repeated_in_batch_reward/mean": 0.75, "rewards/repeated_in_batch_reward/std": 0.4399413466453552, "rewards/thermo_reward/mean": 0.6333456635475159, "rewards/thermo_reward/std": 2.6188883781433105, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 260.3125, "completions/mean_terminated_length": 260.3125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.09032700955867767, "epoch": 0.016, "frac_reward_zero_std": 0.0, "grad_norm": 0.46145445108413696, "learning_rate": 2.8e-07, "loss": -0.066, "num_tokens": 68779.0, "reward": 2.6856346130371094, "reward_std": 11.17774772644043, "rewards/fitness_reward/mean": 2.2029271125793457, "rewards/fitness_reward/std": 5.6162285804748535, "rewards/kidney_reward/mean": 0.2928307056427002, "rewards/kidney_reward/std": 2.640321969985962, "rewards/length2tails_reward/mean": 0.6909781694412231, "rewards/length2tails_reward/std": 0.3927857577800751, "rewards/repeated_in_batch_reward/mean": 0.875, "rewards/repeated_in_batch_reward/std": 0.33601075410842896, "rewards/thermo_reward/mean": 0.03327919542789459, "rewards/thermo_reward/std": 3.2679364681243896, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 251.71875, "completions/mean_terminated_length": 251.71875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.10597977228462696, "epoch": 0.018, "frac_reward_zero_std": 0.0, "grad_norm": 0.4298640191555023, "learning_rate": 3.2e-07, "loss": -0.0897, "num_tokens": 76866.0, "reward": -1.6654566526412964, "reward_std": 11.136141777038574, "rewards/fitness_reward/mean": 0.48509520292282104, "rewards/fitness_reward/std": 5.592866897583008, "rewards/kidney_reward/mean": -0.9065221548080444, "rewards/kidney_reward/std": 2.818354606628418, "rewards/length2tails_reward/mean": 0.7829936742782593, "rewards/length2tails_reward/std": 0.35669487714767456, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": -1.4160789251327515, "rewards/thermo_reward/std": 3.1695234775543213, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.03125, "completions/mean_terminated_length": 270.03125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08620737865567207, "epoch": 0.02, "frac_reward_zero_std": 0.0, "grad_norm": 0.21064135432243347, "learning_rate": 3.6e-07, "loss": 0.0034, "num_tokens": 85539.0, "reward": 3.2720069885253906, "reward_std": 10.456210136413574, "rewards/fitness_reward/mean": 2.608745574951172, "rewards/fitness_reward/std": 5.498191833496094, "rewards/kidney_reward/mean": 0.5225638151168823, "rewards/kidney_reward/std": 2.4604580402374268, "rewards/length2tails_reward/mean": 0.5517613887786865, "rewards/length2tails_reward/std": 0.44645097851753235, "rewards/repeated_in_batch_reward/mean": 0.90625, "rewards/repeated_in_batch_reward/std": 0.2961445748806, "rewards/thermo_reward/mean": -0.005103394389152527, "rewards/thermo_reward/std": 3.0479538440704346, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08696451969444752, "epoch": 0.022, "frac_reward_zero_std": 0.0, "grad_norm": 0.26834744215011597, "learning_rate": 4e-07, "loss": -0.001, "num_tokens": 94215.0, "reward": 6.2407965660095215, "reward_std": 9.487542152404785, "rewards/fitness_reward/mean": 3.9930999279022217, "rewards/fitness_reward/std": 4.978099346160889, "rewards/kidney_reward/mean": 0.9276078343391418, "rewards/kidney_reward/std": 2.1969923973083496, "rewards/length2tails_reward/mean": 0.5903832912445068, "rewards/length2tails_reward/std": 0.4178674817085266, "rewards/repeated_in_batch_reward/mean": 0.84375, "rewards/repeated_in_batch_reward/std": 0.3689020276069641, "rewards/thermo_reward/mean": 1.176675796508789, "rewards/thermo_reward/std": 2.9462826251983643, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 268.15625, "completions/mean_terminated_length": 268.15625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "entropy": 0.09193634986877441, "epoch": 0.024, "frac_reward_zero_std": 0.0, "grad_norm": 0.6354812979698181, "learning_rate": 4.3999999999999997e-07, "loss": 0.0025, "num_tokens": 102828.0, "reward": 0.7271066308021545, "reward_std": 10.12745475769043, "rewards/fitness_reward/mean": 1.473478078842163, "rewards/fitness_reward/std": 5.440088272094727, "rewards/kidney_reward/mean": -0.10944777727127075, "rewards/kidney_reward/std": 2.518666982650757, "rewards/length2tails_reward/mean": 0.49587365984916687, "rewards/length2tails_reward/std": 0.4503480792045593, "rewards/repeated_in_batch_reward/mean": 0.78125, "rewards/repeated_in_batch_reward/std": 0.420013427734375, "rewards/thermo_reward/mean": -0.7646359205245972, "rewards/thermo_reward/std": 2.862889051437378, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 266.03125, "completions/mean_terminated_length": 266.03125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.08739372063428164, "epoch": 0.026, "frac_reward_zero_std": 0.0, "grad_norm": 0.2770998775959015, "learning_rate": 4.8e-07, "loss": -0.0301, "num_tokens": 111373.0, "reward": 0.3932121992111206, "reward_std": 10.29038143157959, "rewards/fitness_reward/mean": 1.5364488363265991, "rewards/fitness_reward/std": 5.486446857452393, "rewards/kidney_reward/mean": -0.43019402027130127, "rewards/kidney_reward/std": 2.4975287914276123, "rewards/length2tails_reward/mean": 0.6225026845932007, "rewards/length2tails_reward/std": 0.44325339794158936, "rewards/repeated_in_batch_reward/mean": 0.8125, "rewards/repeated_in_batch_reward/std": 0.3965577781200409, "rewards/thermo_reward/mean": -0.856542706489563, "rewards/thermo_reward/std": 3.0668582916259766, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 269.0, "completions/mean_terminated_length": 269.0, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.07466705655679107, "epoch": 0.028, "frac_reward_zero_std": 0.0, "grad_norm": 0.268663227558136, "learning_rate": 5.2e-07, "loss": -0.0038, "num_tokens": 120013.0, "reward": 3.62774658203125, "reward_std": 9.2737455368042, "rewards/fitness_reward/mean": 1.760581374168396, "rewards/fitness_reward/std": 5.374361038208008, "rewards/kidney_reward/mean": 0.8924310803413391, "rewards/kidney_reward/std": 1.9723899364471436, "rewards/length2tails_reward/mean": 0.43759018182754517, "rewards/length2tails_reward/std": 0.46352171897888184, "rewards/repeated_in_batch_reward/mean": 0.75, "rewards/repeated_in_batch_reward/std": 0.4399413466453552, "rewards/thermo_reward/mean": 0.8559751510620117, "rewards/thermo_reward/std": 2.598565101623535, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 269.71875, "completions/mean_terminated_length": 269.71875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.07777309278026223, "epoch": 0.03, "frac_reward_zero_std": 0.0, "grad_norm": 0.1419842541217804, "learning_rate": 5.6e-07, "loss": 0.0002, "num_tokens": 128676.0, "reward": -0.15081048011779785, "reward_std": 10.177221298217773, "rewards/fitness_reward/mean": 0.7644028663635254, "rewards/fitness_reward/std": 5.267439365386963, "rewards/kidney_reward/mean": -0.33808475732803345, "rewards/kidney_reward/std": 2.4396588802337646, "rewards/length2tails_reward/mean": 0.5479411482810974, "rewards/length2tails_reward/std": 0.4623284339904785, "rewards/repeated_in_batch_reward/mean": 0.78125, "rewards/repeated_in_batch_reward/std": 0.420013427734375, "rewards/thermo_reward/mean": -0.7100476622581482, "rewards/thermo_reward/std": 2.836381435394287, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 266.84375, "completions/mean_terminated_length": 266.84375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.07267575850710273, "epoch": 0.032, "frac_reward_zero_std": 0.0, "grad_norm": 0.11633159220218658, "learning_rate": 6e-07, "loss": -0.0115, "num_tokens": 137247.0, "reward": -1.5944565534591675, "reward_std": 10.127143859863281, "rewards/fitness_reward/mean": -0.03426043689250946, "rewards/fitness_reward/std": 5.2975969314575195, "rewards/kidney_reward/mean": -0.5960159301757812, "rewards/kidney_reward/std": 2.679882526397705, "rewards/length2tails_reward/mean": 0.5157021284103394, "rewards/length2tails_reward/std": 0.45434051752090454, "rewards/repeated_in_batch_reward/mean": 0.84375, "rewards/repeated_in_batch_reward/std": 0.3689020276069641, "rewards/thermo_reward/mean": -1.1001253128051758, "rewards/thermo_reward/std": 2.988361120223999, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.15625, "completions/mean_terminated_length": 269.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.07479206379503012, "epoch": 0.034, "frac_reward_zero_std": 0.0, "grad_norm": 0.3478950262069702, "learning_rate": 6.4e-07, "loss": 0.0022, "num_tokens": 145892.0, "reward": 1.75685715675354, "reward_std": 10.682146072387695, "rewards/fitness_reward/mean": 1.3270858526229858, "rewards/fitness_reward/std": 5.450982093811035, "rewards/kidney_reward/mean": 0.19966863095760345, "rewards/kidney_reward/std": 2.483642816543579, "rewards/length2tails_reward/mean": 0.5242193341255188, "rewards/length2tails_reward/std": 0.4437546133995056, "rewards/repeated_in_batch_reward/mean": 0.78125, "rewards/repeated_in_batch_reward/std": 0.420013427734375, "rewards/thermo_reward/mean": 0.09955573081970215, "rewards/thermo_reward/std": 3.0443100929260254, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.8125, "completions/mean_terminated_length": 269.8125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.0834053922444582, "epoch": 0.036, "frac_reward_zero_std": 0.0, "grad_norm": 0.32707831263542175, "learning_rate": 6.800000000000001e-07, "loss": 0.0012, "num_tokens": 154558.0, "reward": 4.515446186065674, "reward_std": 9.859593391418457, "rewards/fitness_reward/mean": 3.1498522758483887, "rewards/fitness_reward/std": 5.432015895843506, "rewards/kidney_reward/mean": 0.5560940504074097, "rewards/kidney_reward/std": 2.2329390048980713, "rewards/length2tails_reward/mean": 0.5659103393554688, "rewards/length2tails_reward/std": 0.41648009419441223, "rewards/repeated_in_batch_reward/mean": 0.90625, "rewards/repeated_in_batch_reward/std": 0.2961445748806, "rewards/thermo_reward/mean": 0.6622838973999023, "rewards/thermo_reward/std": 3.0777931213378906, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08045468153432012, "epoch": 0.038, "frac_reward_zero_std": 0.0, "grad_norm": 0.3290388584136963, "learning_rate": 7.2e-07, "loss": -0.0034, "num_tokens": 163234.0, "reward": 2.1541008949279785, "reward_std": 9.83704662322998, "rewards/fitness_reward/mean": 1.7646312713623047, "rewards/fitness_reward/std": 5.3715972900390625, "rewards/kidney_reward/mean": 0.3831119239330292, "rewards/kidney_reward/std": 2.294931650161743, "rewards/length2tails_reward/mean": 0.5611617565155029, "rewards/length2tails_reward/std": 0.43626290559768677, "rewards/repeated_in_batch_reward/mean": 0.71875, "rewards/repeated_in_batch_reward/std": 0.45680341124534607, "rewards/thermo_reward/mean": -0.1216331273317337, "rewards/thermo_reward/std": 2.7697622776031494, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 266.0, "completions/mean_terminated_length": 266.0, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.09172683954238892, "epoch": 0.04, "frac_reward_zero_std": 0.0, "grad_norm": 0.23171117901802063, "learning_rate": 7.599999999999999e-07, "loss": -0.0154, "num_tokens": 171778.0, "reward": 2.084986925125122, "reward_std": 11.089282035827637, "rewards/fitness_reward/mean": 2.350027084350586, "rewards/fitness_reward/std": 5.590826988220215, "rewards/kidney_reward/mean": 0.028891414403915405, "rewards/kidney_reward/std": 2.7076759338378906, "rewards/length2tails_reward/mean": 0.5892895460128784, "rewards/length2tails_reward/std": 0.4344477653503418, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": -0.45286035537719727, "rewards/thermo_reward/std": 3.284806728363037, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 264.34375, "completions/mean_terminated_length": 264.34375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.07852034131065011, "epoch": 0.042, "frac_reward_zero_std": 0.0, "grad_norm": 0.23757749795913696, "learning_rate": 8e-07, "loss": -0.0244, "num_tokens": 180269.0, "reward": -1.1749228239059448, "reward_std": 10.359306335449219, "rewards/fitness_reward/mean": 0.41968971490859985, "rewards/fitness_reward/std": 5.393928527832031, "rewards/kidney_reward/mean": -0.6090096235275269, "rewards/kidney_reward/std": 2.580422878265381, "rewards/length2tails_reward/mean": 0.6238871812820435, "rewards/length2tails_reward/std": 0.4270743131637573, "rewards/repeated_in_batch_reward/mean": 0.8125, "rewards/repeated_in_batch_reward/std": 0.3965577781200409, "rewards/thermo_reward/mean": -1.129241704940796, "rewards/thermo_reward/std": 2.9064016342163086, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 269.03125, "completions/mean_terminated_length": 269.03125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0786933982744813, "epoch": 0.044, "frac_reward_zero_std": 0.0, "grad_norm": 0.5175756812095642, "learning_rate": 8.399999999999999e-07, "loss": 0.0011, "num_tokens": 188910.0, "reward": 3.9556326866149902, "reward_std": 10.06831169128418, "rewards/fitness_reward/mean": 2.6229472160339355, "rewards/fitness_reward/std": 5.284468173980713, "rewards/kidney_reward/mean": 0.5688485503196716, "rewards/kidney_reward/std": 2.2570488452911377, "rewards/length2tails_reward/mean": 0.5303610563278198, "rewards/length2tails_reward/std": 0.42760950326919556, "rewards/repeated_in_batch_reward/mean": 0.8125, "rewards/repeated_in_batch_reward/std": 0.3965577781200409, "rewards/thermo_reward/mean": 0.629551112651825, "rewards/thermo_reward/std": 2.8939197063446045, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 264.125, "completions/mean_terminated_length": 264.125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.08195907855406404, "epoch": 0.046, "frac_reward_zero_std": 0.0, "grad_norm": 0.3343675434589386, "learning_rate": 8.799999999999999e-07, "loss": -0.0321, "num_tokens": 197394.0, "reward": 3.067450523376465, "reward_std": 10.474081993103027, "rewards/fitness_reward/mean": 2.238030195236206, "rewards/fitness_reward/std": 5.569760799407959, "rewards/kidney_reward/mean": 0.4576724171638489, "rewards/kidney_reward/std": 2.436845064163208, "rewards/length2tails_reward/mean": 0.5516926050186157, "rewards/length2tails_reward/std": 0.43515467643737793, "rewards/repeated_in_batch_reward/mean": 0.84375, "rewards/repeated_in_batch_reward/std": 0.3689020276069641, "rewards/thermo_reward/mean": 0.2322039157152176, "rewards/thermo_reward/std": 3.0297348499298096, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 268.78125, "completions/mean_terminated_length": 268.78125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.10035552131012082, "epoch": 0.048, "frac_reward_zero_std": 0.0, "grad_norm": 0.3725874722003937, "learning_rate": 9.2e-07, "loss": -0.0084, "num_tokens": 206027.0, "reward": 2.082176685333252, "reward_std": 11.04091739654541, "rewards/fitness_reward/mean": 1.8071531057357788, "rewards/fitness_reward/std": 5.459597587585449, "rewards/kidney_reward/mean": -0.025432132184505463, "rewards/kidney_reward/std": 2.6043825149536133, "rewards/length2tails_reward/mean": 0.6623145937919617, "rewards/length2tails_reward/std": 0.3805418014526367, "rewards/repeated_in_batch_reward/mean": 0.84375, "rewards/repeated_in_batch_reward/std": 0.3689020276069641, "rewards/thermo_reward/mean": 0.14984972774982452, "rewards/thermo_reward/std": 3.2950596809387207, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 270.6875, "completions/mean_terminated_length": 270.6875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08541291346773505, "epoch": 0.05, "frac_reward_zero_std": 0.0, "grad_norm": 0.20709945261478424, "learning_rate": 9.6e-07, "loss": 0.0025, "num_tokens": 214721.0, "reward": 2.1533591747283936, "reward_std": 10.898603439331055, "rewards/fitness_reward/mean": 1.8978402614593506, "rewards/fitness_reward/std": 5.5759453773498535, "rewards/kidney_reward/mean": 0.1307317316532135, "rewards/kidney_reward/std": 2.487185001373291, "rewards/length2tails_reward/mean": 0.6618248224258423, "rewards/length2tails_reward/std": 0.4177837669849396, "rewards/repeated_in_batch_reward/mean": 0.84375, "rewards/repeated_in_batch_reward/std": 0.3689020276069641, "rewards/thermo_reward/mean": -0.025770097970962524, "rewards/thermo_reward/std": 3.1279430389404297, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 271.4375, "completions/mean_terminated_length": 271.4375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.101377141661942, "epoch": 0.052, "frac_reward_zero_std": 0.0, "grad_norm": 0.6182202100753784, "learning_rate": 1e-06, "loss": 0.0049, "num_tokens": 223439.0, "reward": -0.09726998209953308, "reward_std": 10.948629379272461, "rewards/fitness_reward/mean": 1.2619261741638184, "rewards/fitness_reward/std": 5.739231109619141, "rewards/kidney_reward/mean": -0.5961160063743591, "rewards/kidney_reward/std": 2.6597819328308105, "rewards/length2tails_reward/mean": 0.6376259326934814, "rewards/length2tails_reward/std": 0.4443195164203644, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": -0.9205924272537231, "rewards/thermo_reward/std": 3.2762088775634766, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10240273922681808, "epoch": 0.054, "frac_reward_zero_std": 0.0, "grad_norm": 0.13618984818458557, "learning_rate": 1.04e-06, "loss": 0.0074, "num_tokens": 232119.0, "reward": 2.9474754333496094, "reward_std": 11.085765838623047, "rewards/fitness_reward/mean": 2.818065643310547, "rewards/fitness_reward/std": 5.607751846313477, "rewards/kidney_reward/mean": 0.2121862769126892, "rewards/kidney_reward/std": 2.740105152130127, "rewards/length2tails_reward/mean": 0.6210638284683228, "rewards/length2tails_reward/std": 0.393775999546051, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": -0.24488294124603271, "rewards/thermo_reward/std": 3.3333635330200195, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 267.375, "completions/mean_terminated_length": 267.375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.08450194029137492, "epoch": 0.056, "frac_reward_zero_std": 0.0, "grad_norm": 0.5073533654212952, "learning_rate": 1.08e-06, "loss": -0.0081, "num_tokens": 240707.0, "reward": 4.150449752807617, "reward_std": 10.743106842041016, "rewards/fitness_reward/mean": 2.5777180194854736, "rewards/fitness_reward/std": 5.342399597167969, "rewards/kidney_reward/mean": 0.5591606497764587, "rewards/kidney_reward/std": 2.441612482070923, "rewards/length2tails_reward/mean": 0.5295137166976929, "rewards/length2tails_reward/std": 0.4105660319328308, "rewards/repeated_in_batch_reward/mean": 0.78125, "rewards/repeated_in_batch_reward/std": 0.420013427734375, "rewards/thermo_reward/mean": 0.8824952244758606, "rewards/thermo_reward/std": 3.181992769241333, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.0625, "completions/mean_terminated_length": 271.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08082179352641106, "epoch": 0.058, "frac_reward_zero_std": 0.0, "grad_norm": 0.15366347134113312, "learning_rate": 1.12e-06, "loss": 0.0001, "num_tokens": 249413.0, "reward": 1.9728987216949463, "reward_std": 10.83619499206543, "rewards/fitness_reward/mean": 1.815950632095337, "rewards/fitness_reward/std": 5.659666538238525, "rewards/kidney_reward/mean": 0.1731961965560913, "rewards/kidney_reward/std": 2.6282520294189453, "rewards/length2tails_reward/mean": 0.6286097764968872, "rewards/length2tails_reward/std": 0.43655481934547424, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": -0.17285895347595215, "rewards/thermo_reward/std": 3.2424631118774414, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.96875, "completions/mean_terminated_length": 270.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08776107709854841, "epoch": 0.06, "frac_reward_zero_std": 0.0, "grad_norm": 0.11649761348962784, "learning_rate": 1.16e-06, "loss": 0.0009, "num_tokens": 258116.0, "reward": 5.3024468421936035, "reward_std": 10.292346954345703, "rewards/fitness_reward/mean": 3.8956284523010254, "rewards/fitness_reward/std": 5.090051174163818, "rewards/kidney_reward/mean": 0.7544621229171753, "rewards/kidney_reward/std": 2.5224790573120117, "rewards/length2tails_reward/mean": 0.685206413269043, "rewards/length2tails_reward/std": 0.3567609190940857, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 0.4900863766670227, "rewards/thermo_reward/std": 3.1310408115386963, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 269.9375, "completions/mean_terminated_length": 269.9375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.07893102057278156, "epoch": 0.062, "frac_reward_zero_std": 0.0, "grad_norm": 0.2900700271129608, "learning_rate": 1.2e-06, "loss": 0.005, "num_tokens": 266786.0, "reward": 1.427734375, "reward_std": 11.001752853393555, "rewards/fitness_reward/mean": 1.4691399335861206, "rewards/fitness_reward/std": 5.541495323181152, "rewards/kidney_reward/mean": -0.09434545040130615, "rewards/kidney_reward/std": 2.562530994415283, "rewards/length2tails_reward/mean": 0.6205853819847107, "rewards/length2tails_reward/std": 0.4454227685928345, "rewards/repeated_in_batch_reward/mean": 0.875, "rewards/repeated_in_batch_reward/std": 0.33601075410842896, "rewards/thermo_reward/mean": -0.09661871194839478, "rewards/thermo_reward/std": 3.113922119140625, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09993747808039188, "epoch": 0.064, "frac_reward_zero_std": 0.0, "grad_norm": 0.22942373156547546, "learning_rate": 1.24e-06, "loss": 0.001, "num_tokens": 275502.0, "reward": 2.7124903202056885, "reward_std": 10.51668643951416, "rewards/fitness_reward/mean": 2.0883593559265137, "rewards/fitness_reward/std": 5.528704643249512, "rewards/kidney_reward/mean": 0.3384060561656952, "rewards/kidney_reward/std": 2.4939019680023193, "rewards/length2tails_reward/mean": 0.6918207406997681, "rewards/length2tails_reward/std": 0.4003857672214508, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 0.12279275059700012, "rewards/thermo_reward/std": 3.189255714416504, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08446257747709751, "epoch": 0.066, "frac_reward_zero_std": 0.0, "grad_norm": 0.19864203035831451, "learning_rate": 1.28e-06, "loss": -0.0032, "num_tokens": 284182.0, "reward": 4.002676486968994, "reward_std": 9.273280143737793, "rewards/fitness_reward/mean": 2.842696189880371, "rewards/fitness_reward/std": 5.155837535858154, "rewards/kidney_reward/mean": 0.535129964351654, "rewards/kidney_reward/std": 2.1154778003692627, "rewards/length2tails_reward/mean": 0.5688945055007935, "rewards/length2tails_reward/std": 0.4270206093788147, "rewards/repeated_in_batch_reward/mean": 0.8125, "rewards/repeated_in_batch_reward/std": 0.3965577781200409, "rewards/thermo_reward/mean": 0.486710786819458, "rewards/thermo_reward/std": 2.8715450763702393, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.09375, "completions/mean_terminated_length": 270.09375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09240176528692245, "epoch": 0.068, "frac_reward_zero_std": 0.0, "grad_norm": 0.17734283208847046, "learning_rate": 1.32e-06, "loss": -0.0013, "num_tokens": 292857.0, "reward": 7.405562400817871, "reward_std": 8.667160034179688, "rewards/fitness_reward/mean": 4.910820007324219, "rewards/fitness_reward/std": 4.48259973526001, "rewards/kidney_reward/mean": 1.2963453531265259, "rewards/kidney_reward/std": 1.9723432064056396, "rewards/length2tails_reward/mean": 0.5431360006332397, "rewards/length2tails_reward/std": 0.43262580037117004, "rewards/repeated_in_batch_reward/mean": 0.90625, "rewards/repeated_in_batch_reward/std": 0.2961445748806, "rewards/thermo_reward/mean": 1.0534589290618896, "rewards/thermo_reward/std": 2.9402530193328857, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.34375, "completions/mean_terminated_length": 270.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08174511976540089, "epoch": 0.07, "frac_reward_zero_std": 0.0, "grad_norm": 0.1431119590997696, "learning_rate": 1.3600000000000001e-06, "loss": -0.0001, "num_tokens": 301540.0, "reward": 3.6283020973205566, "reward_std": 10.12381362915039, "rewards/fitness_reward/mean": 2.708242893218994, "rewards/fitness_reward/std": 5.337825775146484, "rewards/kidney_reward/mean": 0.5316824913024902, "rewards/kidney_reward/std": 2.3590903282165527, "rewards/length2tails_reward/mean": 0.5538583993911743, "rewards/length2tails_reward/std": 0.434944212436676, "rewards/repeated_in_batch_reward/mean": 0.90625, "rewards/repeated_in_batch_reward/std": 0.2961445748806, "rewards/thermo_reward/mean": 0.242366224527359, "rewards/thermo_reward/std": 3.0170631408691406, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.8125, "completions/mean_terminated_length": 269.8125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09009748417884111, "epoch": 0.072, "frac_reward_zero_std": 0.0, "grad_norm": 0.33221614360809326, "learning_rate": 1.4e-06, "loss": 0.0038, "num_tokens": 310206.0, "reward": 3.3963706493377686, "reward_std": 9.3759765625, "rewards/fitness_reward/mean": 3.760885715484619, "rewards/fitness_reward/std": 5.113700866699219, "rewards/kidney_reward/mean": 0.23802819848060608, "rewards/kidney_reward/std": 2.434286594390869, "rewards/length2tails_reward/mean": 0.5747653245925903, "rewards/length2tails_reward/std": 0.4042164981365204, "rewards/repeated_in_batch_reward/mean": 0.90625, "rewards/repeated_in_batch_reward/std": 0.2961445748806, "rewards/thermo_reward/mean": -0.7506450414657593, "rewards/thermo_reward/std": 2.9914844036102295, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.5, "completions/mean_terminated_length": 269.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08821279183030128, "epoch": 0.074, "frac_reward_zero_std": 0.0, "grad_norm": 0.1559227555990219, "learning_rate": 1.44e-06, "loss": 0.0016, "num_tokens": 318862.0, "reward": 4.784730911254883, "reward_std": 10.130960464477539, "rewards/fitness_reward/mean": 3.2076220512390137, "rewards/fitness_reward/std": 5.36002779006958, "rewards/kidney_reward/mean": 0.719211757183075, "rewards/kidney_reward/std": 2.3217809200286865, "rewards/length2tails_reward/mean": 0.5281630158424377, "rewards/length2tails_reward/std": 0.40869244933128357, "rewards/repeated_in_batch_reward/mean": 0.90625, "rewards/repeated_in_batch_reward/std": 0.2961445748806, "rewards/thermo_reward/mean": 0.7144560813903809, "rewards/thermo_reward/std": 3.1187803745269775, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.5, "completions/mean_terminated_length": 269.5, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.08450527861714363, "epoch": 0.076, "frac_reward_zero_std": 0.0, "grad_norm": 0.18637605011463165, "learning_rate": 1.48e-06, "loss": -0.0027, "num_tokens": 327518.0, "reward": 6.34855842590332, "reward_std": 8.797586441040039, "rewards/fitness_reward/mean": 4.340523719787598, "rewards/fitness_reward/std": 4.804779052734375, "rewards/kidney_reward/mean": 1.185357689857483, "rewards/kidney_reward/std": 2.0141444206237793, "rewards/length2tails_reward/mean": 0.48803573846817017, "rewards/length2tails_reward/std": 0.4352822005748749, "rewards/repeated_in_batch_reward/mean": 0.875, "rewards/repeated_in_batch_reward/std": 0.33601075410842896, "rewards/thermo_reward/mean": 0.6863734722137451, "rewards/thermo_reward/std": 2.7634387016296387, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.0625, "completions/mean_terminated_length": 270.0625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.09006988536566496, "epoch": 0.078, "frac_reward_zero_std": 0.0, "grad_norm": 0.2921997308731079, "learning_rate": 1.5199999999999998e-06, "loss": -0.0037, "num_tokens": 336192.0, "reward": 3.6054391860961914, "reward_std": 10.492602348327637, "rewards/fitness_reward/mean": 2.8815646171569824, "rewards/fitness_reward/std": 5.417086124420166, "rewards/kidney_reward/mean": 0.4850673973560333, "rewards/kidney_reward/std": 2.5406293869018555, "rewards/length2tails_reward/mean": 0.6245236396789551, "rewards/length2tails_reward/std": 0.4255659580230713, "rewards/repeated_in_batch_reward/mean": 0.875, "rewards/repeated_in_batch_reward/std": 0.33601075410842896, "rewards/thermo_reward/mean": 0.0888550728559494, "rewards/thermo_reward/std": 3.0721518993377686, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.03125, "completions/mean_terminated_length": 271.03125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09613204980269074, "epoch": 0.08, "frac_reward_zero_std": 0.0, "grad_norm": 0.12304268777370453, "learning_rate": 1.5599999999999999e-06, "loss": 0.0001, "num_tokens": 344897.0, "reward": 2.7564549446105957, "reward_std": 10.53861141204834, "rewards/fitness_reward/mean": 2.2039315700531006, "rewards/fitness_reward/std": 5.489523887634277, "rewards/kidney_reward/mean": 0.2535344660282135, "rewards/kidney_reward/std": 2.427860736846924, "rewards/length2tails_reward/mean": 0.6793840527534485, "rewards/length2tails_reward/std": 0.3961794376373291, "rewards/repeated_in_batch_reward/mean": 0.875, "rewards/repeated_in_batch_reward/std": 0.33601075410842896, "rewards/thermo_reward/mean": 0.1435505896806717, "rewards/thermo_reward/std": 3.074892520904541, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 268.96875, "completions/mean_terminated_length": 268.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08931446820497513, "epoch": 0.082, "frac_reward_zero_std": 0.0, "grad_norm": 0.1744069755077362, "learning_rate": 1.6e-06, "loss": -0.0007, "num_tokens": 353536.0, "reward": 5.175799369812012, "reward_std": 8.847229957580566, "rewards/fitness_reward/mean": 4.311430931091309, "rewards/fitness_reward/std": 4.850402355194092, "rewards/kidney_reward/mean": 0.6826827526092529, "rewards/kidney_reward/std": 2.2333693504333496, "rewards/length2tails_reward/mean": 0.4906601905822754, "rewards/length2tails_reward/std": 0.4163333475589752, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 0.03886973857879639, "rewards/thermo_reward/std": 3.012194871902466, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 264.375, "completions/mean_terminated_length": 264.375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.08490060642361641, "epoch": 0.084, "frac_reward_zero_std": 0.0, "grad_norm": 0.22794437408447266, "learning_rate": 1.6399999999999998e-06, "loss": -0.0285, "num_tokens": 362028.0, "reward": 2.1772820949554443, "reward_std": 10.892127990722656, "rewards/fitness_reward/mean": 1.9527804851531982, "rewards/fitness_reward/std": 5.5872039794921875, "rewards/kidney_reward/mean": 0.060886450111866, "rewards/kidney_reward/std": 2.6227242946624756, "rewards/length2tails_reward/mean": 0.5533137917518616, "rewards/length2tails_reward/std": 0.44542330503463745, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 0.014533862471580505, "rewards/thermo_reward/std": 3.3594391345977783, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 264.3125, "completions/mean_terminated_length": 264.3125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.09379345551133156, "epoch": 0.086, "frac_reward_zero_std": 0.0, "grad_norm": 0.19165834784507751, "learning_rate": 1.6799999999999998e-06, "loss": -0.0471, "num_tokens": 370518.0, "reward": 5.736239910125732, "reward_std": 9.859665870666504, "rewards/fitness_reward/mean": 4.196285247802734, "rewards/fitness_reward/std": 5.0344109535217285, "rewards/kidney_reward/mean": 1.027575969696045, "rewards/kidney_reward/std": 2.3718574047088623, "rewards/length2tails_reward/mean": 0.6502601504325867, "rewards/length2tails_reward/std": 0.3797384202480316, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 0.3536030650138855, "rewards/thermo_reward/std": 3.109739303588867, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.1875, "completions/mean_terminated_length": 270.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09621737897396088, "epoch": 0.088, "frac_reward_zero_std": 0.0, "grad_norm": 0.16425421833992004, "learning_rate": 1.7199999999999998e-06, "loss": 0.001, "num_tokens": 379196.0, "reward": 5.795779228210449, "reward_std": 9.847657203674316, "rewards/fitness_reward/mean": 3.5787415504455566, "rewards/fitness_reward/std": 5.213191986083984, "rewards/kidney_reward/mean": 1.1071183681488037, "rewards/kidney_reward/std": 2.1389410495758057, "rewards/length2tails_reward/mean": 0.6040310263633728, "rewards/length2tails_reward/std": 0.4115493595600128, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 0.9557660818099976, "rewards/thermo_reward/std": 3.0586154460906982, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 268.96875, "completions/mean_terminated_length": 268.96875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "entropy": 0.0871056062169373, "epoch": 0.09, "frac_reward_zero_std": 0.0, "grad_norm": 0.24304160475730896, "learning_rate": 1.7599999999999999e-06, "loss": 0.0022, "num_tokens": 387835.0, "reward": 4.993832111358643, "reward_std": 10.746659278869629, "rewards/fitness_reward/mean": 3.420689582824707, "rewards/fitness_reward/std": 5.437535285949707, "rewards/kidney_reward/mean": 0.7717792987823486, "rewards/kidney_reward/std": 2.535116672515869, "rewards/length2tails_reward/mean": 0.5175907611846924, "rewards/length2tails_reward/std": 0.45717287063598633, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.6496044397354126, "rewards/thermo_reward/std": 3.1371889114379883, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.6875, "completions/mean_terminated_length": 269.6875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0908602443523705, "epoch": 0.092, "frac_reward_zero_std": 0.0, "grad_norm": 0.11827373504638672, "learning_rate": 1.8e-06, "loss": 0.0039, "num_tokens": 396497.0, "reward": 5.899564743041992, "reward_std": 9.54543399810791, "rewards/fitness_reward/mean": 4.488227844238281, "rewards/fitness_reward/std": 4.842160701751709, "rewards/kidney_reward/mean": 0.8022502660751343, "rewards/kidney_reward/std": 2.440622091293335, "rewards/length2tails_reward/mean": 0.566992461681366, "rewards/length2tails_reward/std": 0.3695901930332184, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 0.4586372375488281, "rewards/thermo_reward/std": 3.1700398921966553, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.28125, "completions/mean_terminated_length": 271.28125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08991517964750528, "epoch": 0.094, "frac_reward_zero_std": 0.0, "grad_norm": 0.07967487722635269, "learning_rate": 1.84e-06, "loss": 0.0012, "num_tokens": 405210.0, "reward": 7.124594211578369, "reward_std": 9.315996170043945, "rewards/fitness_reward/mean": 4.596066474914551, "rewards/fitness_reward/std": 4.757828235626221, "rewards/kidney_reward/mean": 1.2813934087753296, "rewards/kidney_reward/std": 2.2047688961029053, "rewards/length2tails_reward/mean": 0.6318175792694092, "rewards/length2tails_reward/std": 0.4136396050453186, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 1.0902026891708374, "rewards/thermo_reward/std": 2.822467565536499, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08989114267751575, "epoch": 0.096, "frac_reward_zero_std": 0.0, "grad_norm": 0.0939304381608963, "learning_rate": 1.8799999999999998e-06, "loss": 0.0039, "num_tokens": 413886.0, "reward": 4.650942325592041, "reward_std": 10.98830509185791, "rewards/fitness_reward/mean": 3.210702419281006, "rewards/fitness_reward/std": 5.458611011505127, "rewards/kidney_reward/mean": 0.6795892119407654, "rewards/kidney_reward/std": 2.532743453979492, "rewards/length2tails_reward/mean": 0.5853027105331421, "rewards/length2tails_reward/std": 0.42769482731819153, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.6021201610565186, "rewards/thermo_reward/std": 3.22406005859375, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.34375, "completions/mean_terminated_length": 270.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09917700570076704, "epoch": 0.098, "frac_reward_zero_std": 0.0, "grad_norm": 0.15307842195034027, "learning_rate": 1.92e-06, "loss": 0.0077, "num_tokens": 422569.0, "reward": 5.508678436279297, "reward_std": 10.777318954467773, "rewards/fitness_reward/mean": 3.7935280799865723, "rewards/fitness_reward/std": 5.3776984214782715, "rewards/kidney_reward/mean": 0.8095374703407288, "rewards/kidney_reward/std": 2.691157817840576, "rewards/length2tails_reward/mean": 0.6158257722854614, "rewards/length2tails_reward/std": 0.3776721954345703, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.7440307140350342, "rewards/thermo_reward/std": 3.310176134109497, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.4375, "completions/mean_terminated_length": 270.4375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08912370260804892, "epoch": 0.1, "frac_reward_zero_std": 0.0, "grad_norm": 0.1141011118888855, "learning_rate": 1.96e-06, "loss": 0.0002, "num_tokens": 431255.0, "reward": 3.954058885574341, "reward_std": 9.64480972290039, "rewards/fitness_reward/mean": 2.8021559715270996, "rewards/fitness_reward/std": 5.311481952667236, "rewards/kidney_reward/mean": 0.5934000611305237, "rewards/kidney_reward/std": 2.2488973140716553, "rewards/length2tails_reward/mean": 0.5961021184921265, "rewards/length2tails_reward/std": 0.4461689591407776, "rewards/repeated_in_batch_reward/mean": 0.90625, "rewards/repeated_in_batch_reward/std": 0.2961445748806, "rewards/thermo_reward/mean": 0.4082678556442261, "rewards/thermo_reward/std": 2.953021764755249, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.34375, "completions/mean_terminated_length": 269.34375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.08916603773832321, "epoch": 0.102, "frac_reward_zero_std": 0.0, "grad_norm": 0.14698363840579987, "learning_rate": 2e-06, "loss": -0.0057, "num_tokens": 439906.0, "reward": 7.744524002075195, "reward_std": 7.743293285369873, "rewards/fitness_reward/mean": 4.869905948638916, "rewards/fitness_reward/std": 4.543521404266357, "rewards/kidney_reward/mean": 1.560369849205017, "rewards/kidney_reward/std": 1.8243423700332642, "rewards/length2tails_reward/mean": 0.4840232729911804, "rewards/length2tails_reward/std": 0.41251611709594727, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 1.1720962524414062, "rewards/thermo_reward/std": 2.5674214363098145, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09895084332674742, "epoch": 0.104, "frac_reward_zero_std": 0.0, "grad_norm": 0.11244598031044006, "learning_rate": 1.9999991778756307e-06, "loss": 0.0014, "num_tokens": 448602.0, "reward": 8.26731014251709, "reward_std": 9.016767501831055, "rewards/fitness_reward/mean": 4.987358570098877, "rewards/fitness_reward/std": 4.567765235900879, "rewards/kidney_reward/mean": 1.553128957748413, "rewards/kidney_reward/std": 2.018177032470703, "rewards/length2tails_reward/mean": 0.6099748015403748, "rewards/length2tails_reward/std": 0.41677555441856384, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.5658247470855713, "rewards/thermo_reward/std": 2.8165760040283203, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 267.09375, "completions/mean_terminated_length": 267.09375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.09943242277950048, "epoch": 0.106, "frac_reward_zero_std": 0.0, "grad_norm": 0.19205942749977112, "learning_rate": 1.999996711503876e-06, "loss": -0.0187, "num_tokens": 457181.0, "reward": 5.50713586807251, "reward_std": 10.876879692077637, "rewards/fitness_reward/mean": 3.5678153038024902, "rewards/fitness_reward/std": 5.341129302978516, "rewards/kidney_reward/mean": 0.8138814568519592, "rewards/kidney_reward/std": 2.539766550064087, "rewards/length2tails_reward/mean": 0.636076807975769, "rewards/length2tails_reward/std": 0.4049949049949646, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.9618313908576965, "rewards/thermo_reward/std": 3.2641184329986572, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.90625, "completions/mean_terminated_length": 269.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09612957295030355, "epoch": 0.108, "frac_reward_zero_std": 0.0, "grad_norm": 0.12145273387432098, "learning_rate": 1.9999926008887906e-06, "loss": 0.0016, "num_tokens": 465850.0, "reward": 5.88297176361084, "reward_std": 9.687450408935547, "rewards/fitness_reward/mean": 3.9562315940856934, "rewards/fitness_reward/std": 5.032125473022461, "rewards/kidney_reward/mean": 0.9145177602767944, "rewards/kidney_reward/std": 2.250870943069458, "rewards/length2tails_reward/mean": 0.5738779902458191, "rewards/length2tails_reward/std": 0.4065060615539551, "rewards/repeated_in_batch_reward/mean": 0.875, "rewards/repeated_in_batch_reward/std": 0.33601075410842896, "rewards/thermo_reward/mean": 0.867334246635437, "rewards/thermo_reward/std": 2.963379383087158, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.53125, "completions/mean_terminated_length": 272.53125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10116562247276306, "epoch": 0.11, "frac_reward_zero_std": 0.0, "grad_norm": 0.07908673584461212, "learning_rate": 1.999986846037133e-06, "loss": 0.0, "num_tokens": 474603.0, "reward": 4.582418441772461, "reward_std": 10.85931396484375, "rewards/fitness_reward/mean": 3.5986382961273193, "rewards/fitness_reward/std": 5.242656707763672, "rewards/kidney_reward/mean": 0.5228749513626099, "rewards/kidney_reward/std": 2.615586042404175, "rewards/length2tails_reward/mean": 0.7722164988517761, "rewards/length2tails_reward/std": 0.31869614124298096, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.28368380665779114, "rewards/thermo_reward/std": 3.4354615211486816, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.3125, "completions/mean_terminated_length": 271.3125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09424130246043205, "epoch": 0.112, "frac_reward_zero_std": 0.0, "grad_norm": 0.07371854037046432, "learning_rate": 1.9999794469583658e-06, "loss": 0.0014, "num_tokens": 483317.0, "reward": 5.236574649810791, "reward_std": 9.785390853881836, "rewards/fitness_reward/mean": 3.810666799545288, "rewards/fitness_reward/std": 5.2446608543396, "rewards/kidney_reward/mean": 0.7944426536560059, "rewards/kidney_reward/std": 2.3587422370910645, "rewards/length2tails_reward/mean": 0.600432813167572, "rewards/length2tails_reward/std": 0.43214908242225647, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.4714217782020569, "rewards/thermo_reward/std": 3.0919981002807617, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 263.65625, "completions/mean_terminated_length": 263.65625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.09555036667734385, "epoch": 0.114, "frac_reward_zero_std": 0.0, "grad_norm": 0.1235174760222435, "learning_rate": 1.9999704036646555e-06, "loss": -0.0246, "num_tokens": 491786.0, "reward": 2.1675093173980713, "reward_std": 11.569999694824219, "rewards/fitness_reward/mean": 2.065021514892578, "rewards/fitness_reward/std": 5.637988090515137, "rewards/kidney_reward/mean": 0.03666689991950989, "rewards/kidney_reward/std": 2.760089635848999, "rewards/length2tails_reward/mean": 0.616014838218689, "rewards/length2tails_reward/std": 0.41217100620269775, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": -0.0895305871963501, "rewards/thermo_reward/std": 3.400143623352051, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.0, "completions/mean_terminated_length": 270.0, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.1021167878061533, "epoch": 0.116, "frac_reward_zero_std": 0.0, "grad_norm": 0.06497107446193695, "learning_rate": 1.999959716170871e-06, "loss": 0.0026, "num_tokens": 500458.0, "reward": 8.043519020080566, "reward_std": 8.370131492614746, "rewards/fitness_reward/mean": 5.48520565032959, "rewards/fitness_reward/std": 4.132047176361084, "rewards/kidney_reward/mean": 1.4747217893600464, "rewards/kidney_reward/std": 2.0179266929626465, "rewards/length2tails_reward/mean": 0.6013509035110474, "rewards/length2tails_reward/std": 0.36498507857322693, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.9234564304351807, "rewards/thermo_reward/std": 2.9822323322296143, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 270.625, "completions/mean_terminated_length": 270.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08857986517250538, "epoch": 0.118, "frac_reward_zero_std": 0.0, "grad_norm": 0.09607218205928802, "learning_rate": 1.999947384494585e-06, "loss": 0.0064, "num_tokens": 509150.0, "reward": 1.3199851512908936, "reward_std": 10.509997367858887, "rewards/fitness_reward/mean": 2.571333885192871, "rewards/fitness_reward/std": 5.572249889373779, "rewards/kidney_reward/mean": -0.37929677963256836, "rewards/kidney_reward/std": 2.616180658340454, "rewards/length2tails_reward/mean": 0.6478626728057861, "rewards/length2tails_reward/std": 0.4019405245780945, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": -1.0368380546569824, "rewards/thermo_reward/std": 3.0118613243103027, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10882160067558289, "epoch": 0.12, "frac_reward_zero_std": 0.0, "grad_norm": 0.13335488736629486, "learning_rate": 1.9999334086560746e-06, "loss": 0.0025, "num_tokens": 517898.0, "reward": 4.216526985168457, "reward_std": 11.425372123718262, "rewards/fitness_reward/mean": 3.1399548053741455, "rewards/fitness_reward/std": 5.560117244720459, "rewards/kidney_reward/mean": 0.4496838450431824, "rewards/kidney_reward/std": 2.7316980361938477, "rewards/length2tails_reward/mean": 0.739764392375946, "rewards/length2tails_reward/std": 0.348965585231781, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.45291224122047424, "rewards/thermo_reward/std": 3.5019266605377197, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09633060963824391, "epoch": 0.122, "frac_reward_zero_std": 0.0, "grad_norm": 0.09005337953567505, "learning_rate": 1.999917788678319e-06, "loss": 0.0008, "num_tokens": 526594.0, "reward": 5.742714881896973, "reward_std": 9.801647186279297, "rewards/fitness_reward/mean": 4.097263813018799, "rewards/fitness_reward/std": 5.070271015167236, "rewards/kidney_reward/mean": 0.6791552901268005, "rewards/kidney_reward/std": 2.318540573120117, "rewards/length2tails_reward/mean": 0.6282171607017517, "rewards/length2tails_reward/std": 0.4059312045574188, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.8034740686416626, "rewards/thermo_reward/std": 3.17315411567688, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.875, "completions/mean_terminated_length": 269.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10056066047400236, "epoch": 0.124, "frac_reward_zero_std": 0.0, "grad_norm": 0.09247541427612305, "learning_rate": 1.9999005245870014e-06, "loss": 0.0059, "num_tokens": 535262.0, "reward": 4.459540367126465, "reward_std": 10.471243858337402, "rewards/fitness_reward/mean": 3.3481173515319824, "rewards/fitness_reward/std": 5.301581859588623, "rewards/kidney_reward/mean": 0.5900368094444275, "rewards/kidney_reward/std": 2.4857983589172363, "rewards/length2tails_reward/mean": 0.6027665138244629, "rewards/length2tails_reward/std": 0.40925168991088867, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.3611098527908325, "rewards/thermo_reward/std": 3.0389420986175537, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09483396168798208, "epoch": 0.126, "frac_reward_zero_std": 0.0, "grad_norm": 0.10048498958349228, "learning_rate": 1.9998816164105085e-06, "loss": 0.0018, "num_tokens": 543958.0, "reward": 5.394386291503906, "reward_std": 9.99219036102295, "rewards/fitness_reward/mean": 4.284843921661377, "rewards/fitness_reward/std": 4.95587682723999, "rewards/kidney_reward/mean": 0.7851080894470215, "rewards/kidney_reward/std": 2.488471031188965, "rewards/length2tails_reward/mean": 0.644648551940918, "rewards/length2tails_reward/std": 0.3986409604549408, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.15996931493282318, "rewards/thermo_reward/std": 3.202000617980957, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.15625, "completions/mean_terminated_length": 270.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10331257060170174, "epoch": 0.128, "frac_reward_zero_std": 0.0, "grad_norm": 0.06592488288879395, "learning_rate": 1.99986106417993e-06, "loss": -0.0012, "num_tokens": 552635.0, "reward": 8.70488166809082, "reward_std": 7.780556678771973, "rewards/fitness_reward/mean": 5.549319267272949, "rewards/fitness_reward/std": 4.140100479125977, "rewards/kidney_reward/mean": 1.7368488311767578, "rewards/kidney_reward/std": 1.8308501243591309, "rewards/length2tails_reward/mean": 0.5674126148223877, "rewards/length2tails_reward/std": 0.38003429770469666, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.2619726657867432, "rewards/thermo_reward/std": 2.896923542022705, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.4375, "completions/mean_terminated_length": 269.4375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09276031795889139, "epoch": 0.13, "frac_reward_zero_std": 0.0, "grad_norm": 0.07289103418588638, "learning_rate": 1.999838867929058e-06, "loss": 0.0088, "num_tokens": 561289.0, "reward": 6.259957313537598, "reward_std": 9.348736763000488, "rewards/fitness_reward/mean": 4.764282703399658, "rewards/fitness_reward/std": 4.725064277648926, "rewards/kidney_reward/mean": 0.948445737361908, "rewards/kidney_reward/std": 2.2839484214782715, "rewards/length2tails_reward/mean": 0.5402899980545044, "rewards/length2tails_reward/std": 0.4097994267940521, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.39319971203804016, "rewards/thermo_reward/std": 3.061786651611328, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.0625, "completions/mean_terminated_length": 271.0625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10293789114803076, "epoch": 0.132, "frac_reward_zero_std": 0.0, "grad_norm": 0.07653038948774338, "learning_rate": 1.99981502769439e-06, "loss": 0.0026, "num_tokens": 569995.0, "reward": 3.9891343116760254, "reward_std": 11.183003425598145, "rewards/fitness_reward/mean": 3.2974727153778076, "rewards/fitness_reward/std": 5.498640537261963, "rewards/kidney_reward/mean": 0.4626770615577698, "rewards/kidney_reward/std": 2.654911518096924, "rewards/length2tails_reward/mean": 0.7131531834602356, "rewards/length2tails_reward/std": 0.3393803834915161, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.05766934156417847, "rewards/thermo_reward/std": 3.530383825302124, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 269.96875, "completions/mean_terminated_length": 269.96875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.0999839473515749, "epoch": 0.134, "frac_reward_zero_std": 0.0, "grad_norm": 0.11364952474832535, "learning_rate": 1.9997895435151245e-06, "loss": 0.0032, "num_tokens": 578666.0, "reward": 3.903465747833252, "reward_std": 10.257515907287598, "rewards/fitness_reward/mean": 2.9853196144104004, "rewards/fitness_reward/std": 5.393058776855469, "rewards/kidney_reward/mean": 0.5317540168762207, "rewards/kidney_reward/std": 2.351651906967163, "rewards/length2tails_reward/mean": 0.600960373878479, "rewards/length2tails_reward/std": 0.42705038189888, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 0.23254606127738953, "rewards/thermo_reward/std": 2.9828085899353027, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.03125, "completions/mean_terminated_length": 269.03125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.10749497450888157, "epoch": 0.136, "frac_reward_zero_std": 0.0, "grad_norm": 0.21903333067893982, "learning_rate": 1.999762415433164e-06, "loss": -0.0117, "num_tokens": 587307.0, "reward": 6.335038185119629, "reward_std": 9.666593551635742, "rewards/fitness_reward/mean": 4.830661773681641, "rewards/fitness_reward/std": 4.752323150634766, "rewards/kidney_reward/mean": 0.9589738845825195, "rewards/kidney_reward/std": 2.4108433723449707, "rewards/length2tails_reward/mean": 0.5960825085639954, "rewards/length2tails_reward/std": 0.39040085673332214, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.38579437136650085, "rewards/thermo_reward/std": 3.2222890853881836, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.8125, "completions/mean_terminated_length": 270.8125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09495829651132226, "epoch": 0.138, "frac_reward_zero_std": 0.0, "grad_norm": 0.1579008847475052, "learning_rate": 1.9997336434931136e-06, "loss": -0.0066, "num_tokens": 596005.0, "reward": 6.201303958892822, "reward_std": 8.618098258972168, "rewards/fitness_reward/mean": 4.139523506164551, "rewards/fitness_reward/std": 4.9184184074401855, "rewards/kidney_reward/mean": 1.0352199077606201, "rewards/kidney_reward/std": 2.0201306343078613, "rewards/length2tails_reward/mean": 0.5589442253112793, "rewards/length2tails_reward/std": 0.4349207878112793, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 0.876916229724884, "rewards/thermo_reward/std": 2.810779333114624, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.34375, "completions/mean_terminated_length": 270.34375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.09140817727893591, "epoch": 0.14, "frac_reward_zero_std": 0.0, "grad_norm": 0.08873090893030167, "learning_rate": 1.9997032277422817e-06, "loss": 0.0014, "num_tokens": 604688.0, "reward": 4.505502700805664, "reward_std": 9.884005546569824, "rewards/fitness_reward/mean": 3.5307188034057617, "rewards/fitness_reward/std": 5.276813507080078, "rewards/kidney_reward/mean": 0.6191385984420776, "rewards/kidney_reward/std": 2.336310863494873, "rewards/length2tails_reward/mean": 0.6345251202583313, "rewards/length2tails_reward/std": 0.406307190656662, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.19219228625297546, "rewards/thermo_reward/std": 2.8584790229797363, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 268.125, "completions/mean_terminated_length": 268.125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09404969774186611, "epoch": 0.142, "frac_reward_zero_std": 0.0, "grad_norm": 0.07119587808847427, "learning_rate": 1.99967116823068e-06, "loss": 0.0016, "num_tokens": 613300.0, "reward": 8.309608459472656, "reward_std": 7.589566707611084, "rewards/fitness_reward/mean": 5.349855422973633, "rewards/fitness_reward/std": 4.035140514373779, "rewards/kidney_reward/mean": 1.5614449977874756, "rewards/kidney_reward/std": 1.8345295190811157, "rewards/length2tails_reward/mean": 0.4056016802787781, "rewards/length2tails_reward/std": 0.4158850312232971, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.2577483654022217, "rewards/thermo_reward/std": 2.606032371520996, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.375, "completions/mean_terminated_length": 269.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09486975288018584, "epoch": 0.144, "frac_reward_zero_std": 0.0, "grad_norm": 0.11193745583295822, "learning_rate": 1.999637465011021e-06, "loss": 0.003, "num_tokens": 621952.0, "reward": 5.321290969848633, "reward_std": 8.887402534484863, "rewards/fitness_reward/mean": 4.612102508544922, "rewards/fitness_reward/std": 4.851127624511719, "rewards/kidney_reward/mean": 0.815106213092804, "rewards/kidney_reward/std": 2.2408759593963623, "rewards/length2tails_reward/mean": 0.49875015020370483, "rewards/length2tails_reward/std": 0.42784377932548523, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": -0.2557927072048187, "rewards/thermo_reward/std": 2.849024534225464, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10298145096749067, "epoch": 0.146, "frac_reward_zero_std": 0.0, "grad_norm": 0.09242729842662811, "learning_rate": 1.999602118138722e-06, "loss": -0.0, "num_tokens": 630673.0, "reward": 5.481384754180908, "reward_std": 9.99774169921875, "rewards/fitness_reward/mean": 3.745878219604492, "rewards/fitness_reward/std": 5.336620807647705, "rewards/kidney_reward/mean": 0.8958523869514465, "rewards/kidney_reward/std": 2.3274919986724854, "rewards/length2tails_reward/mean": 0.7147248983383179, "rewards/length2tails_reward/std": 0.33806055784225464, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.6681815981864929, "rewards/thermo_reward/std": 3.090961217880249, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 270.15625, "completions/mean_terminated_length": 270.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08604165725409985, "epoch": 0.148, "frac_reward_zero_std": 0.0, "grad_norm": 0.10083026438951492, "learning_rate": 1.9995651276719022e-06, "loss": 0.0028, "num_tokens": 639350.0, "reward": 5.352714538574219, "reward_std": 9.972210884094238, "rewards/fitness_reward/mean": 3.942040205001831, "rewards/fitness_reward/std": 5.167749404907227, "rewards/kidney_reward/mean": 0.5562193393707275, "rewards/kidney_reward/std": 2.3640551567077637, "rewards/length2tails_reward/mean": 0.5871860980987549, "rewards/length2tails_reward/std": 0.436727374792099, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 0.7019863128662109, "rewards/thermo_reward/std": 3.129981517791748, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11275890469551086, "epoch": 0.15, "frac_reward_zero_std": 0.0, "grad_norm": 0.08880766481161118, "learning_rate": 1.999526493671383e-06, "loss": 0.002, "num_tokens": 648085.0, "reward": 4.680360794067383, "reward_std": 11.522732734680176, "rewards/fitness_reward/mean": 3.4654369354248047, "rewards/fitness_reward/std": 5.4723615646362305, "rewards/kidney_reward/mean": 0.5316477417945862, "rewards/kidney_reward/std": 2.7512929439544678, "rewards/length2tails_reward/mean": 0.7533819675445557, "rewards/length2tails_reward/std": 0.35159409046173096, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.5079379081726074, "rewards/thermo_reward/std": 3.539987087249756, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.03125, "completions/mean_terminated_length": 270.03125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0990257365629077, "epoch": 0.152, "frac_reward_zero_std": 0.0, "grad_norm": 0.1063062772154808, "learning_rate": 1.999486216200688e-06, "loss": 0.0038, "num_tokens": 656758.0, "reward": 6.651132106781006, "reward_std": 8.796796798706055, "rewards/fitness_reward/mean": 4.565807342529297, "rewards/fitness_reward/std": 4.806469917297363, "rewards/kidney_reward/mean": 1.3196901082992554, "rewards/kidney_reward/std": 2.0117340087890625, "rewards/length2tails_reward/mean": 0.5656614303588867, "rewards/length2tails_reward/std": 0.38031989336013794, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 0.6153182983398438, "rewards/thermo_reward/std": 2.7939960956573486, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.96875, "completions/mean_terminated_length": 270.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1029750956222415, "epoch": 0.154, "frac_reward_zero_std": 0.0, "grad_norm": 0.08091334998607635, "learning_rate": 1.999444295326043e-06, "loss": 0.0002, "num_tokens": 665461.0, "reward": 7.837647438049316, "reward_std": 8.722521781921387, "rewards/fitness_reward/mean": 5.028226375579834, "rewards/fitness_reward/std": 4.435754776000977, "rewards/kidney_reward/mean": 1.4896342754364014, "rewards/kidney_reward/std": 1.9922144412994385, "rewards/length2tails_reward/mean": 0.6519697904586792, "rewards/length2tails_reward/std": 0.3789510726928711, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.1545900106430054, "rewards/thermo_reward/std": 2.942502737045288, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.3125, "completions/mean_terminated_length": 270.3125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0930614871904254, "epoch": 0.156, "frac_reward_zero_std": 0.0, "grad_norm": 0.10220735520124435, "learning_rate": 1.9994007311163777e-06, "loss": 0.0018, "num_tokens": 674143.0, "reward": 5.842037677764893, "reward_std": 9.24101734161377, "rewards/fitness_reward/mean": 4.47142219543457, "rewards/fitness_reward/std": 4.9697465896606445, "rewards/kidney_reward/mean": 1.0660680532455444, "rewards/kidney_reward/std": 2.2489302158355713, "rewards/length2tails_reward/mean": 0.56805419921875, "rewards/length2tails_reward/std": 0.416465163230896, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.14774230122566223, "rewards/thermo_reward/std": 3.289302349090576, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.875, "completions/mean_terminated_length": 269.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09572804998606443, "epoch": 0.158, "frac_reward_zero_std": 0.0, "grad_norm": 0.07806969434022903, "learning_rate": 1.999355523643321e-06, "loss": 0.0031, "num_tokens": 682811.0, "reward": 4.878722667694092, "reward_std": 10.568361282348633, "rewards/fitness_reward/mean": 3.5764052867889404, "rewards/fitness_reward/std": 5.329973220825195, "rewards/kidney_reward/mean": 0.6331483125686646, "rewards/kidney_reward/std": 2.4987599849700928, "rewards/length2tails_reward/mean": 0.5859672427177429, "rewards/length2tails_reward/std": 0.4179089665412903, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 0.5168222188949585, "rewards/thermo_reward/std": 3.341632127761841, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.1875, "completions/mean_terminated_length": 269.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1038662726059556, "epoch": 0.16, "frac_reward_zero_std": 0.0, "grad_norm": 0.10314842313528061, "learning_rate": 1.9993086729812065e-06, "loss": 0.0029, "num_tokens": 691457.0, "reward": 7.508264541625977, "reward_std": 8.252596855163574, "rewards/fitness_reward/mean": 5.147569179534912, "rewards/fitness_reward/std": 4.439197063446045, "rewards/kidney_reward/mean": 1.4739444255828857, "rewards/kidney_reward/std": 1.952415943145752, "rewards/length2tails_reward/mean": 0.48841190338134766, "rewards/length2tails_reward/std": 0.42245423793792725, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.7379104495048523, "rewards/thermo_reward/std": 2.9669361114501953, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.21875, "completions/mean_terminated_length": 271.21875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09912833292037249, "epoch": 0.162, "frac_reward_zero_std": 0.0, "grad_norm": 0.09835723787546158, "learning_rate": 1.9992601792070675e-06, "loss": 0.0019, "num_tokens": 700168.0, "reward": 5.911773681640625, "reward_std": 10.059379577636719, "rewards/fitness_reward/mean": 3.8010964393615723, "rewards/fitness_reward/std": 5.367618083953857, "rewards/kidney_reward/mean": 1.019120454788208, "rewards/kidney_reward/std": 2.3711156845092773, "rewards/length2tails_reward/mean": 0.6711513996124268, "rewards/length2tails_reward/std": 0.38624030351638794, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.9244422912597656, "rewards/thermo_reward/std": 3.0485928058624268, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 262.3125, "completions/mean_terminated_length": 262.3125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.09261684585362673, "epoch": 0.164, "frac_reward_zero_std": 0.0, "grad_norm": 0.302977591753006, "learning_rate": 1.9992100424006403e-06, "loss": -0.0424, "num_tokens": 708594.0, "reward": 5.575660705566406, "reward_std": 9.62802505493164, "rewards/fitness_reward/mean": 4.478847503662109, "rewards/fitness_reward/std": 4.956948280334473, "rewards/kidney_reward/mean": 0.8628518581390381, "rewards/kidney_reward/std": 2.399749994277954, "rewards/length2tails_reward/mean": 0.499155730009079, "rewards/length2tails_reward/std": 0.4330158531665802, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.08404579013586044, "rewards/thermo_reward/std": 3.165400981903076, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11235030833631754, "epoch": 0.166, "frac_reward_zero_std": 0.0, "grad_norm": 0.1042870506644249, "learning_rate": 1.9991582626443615e-06, "loss": -0.0035, "num_tokens": 717366.0, "reward": 5.495595932006836, "reward_std": 9.901491165161133, "rewards/fitness_reward/mean": 4.186452865600586, "rewards/fitness_reward/std": 5.04417085647583, "rewards/kidney_reward/mean": 0.7878950834274292, "rewards/kidney_reward/std": 2.339024066925049, "rewards/length2tails_reward/mean": 0.8301782608032227, "rewards/length2tails_reward/std": 0.2957022190093994, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.3382304012775421, "rewards/thermo_reward/std": 3.0513925552368164, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.625, "completions/mean_terminated_length": 270.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10217729490250349, "epoch": 0.168, "frac_reward_zero_std": 0.0, "grad_norm": 0.21394874155521393, "learning_rate": 1.999104840023371e-06, "loss": -0.0013, "num_tokens": 726058.0, "reward": 8.809710502624512, "reward_std": 7.709327697753906, "rewards/fitness_reward/mean": 5.823422431945801, "rewards/fitness_reward/std": 3.6806018352508545, "rewards/kidney_reward/mean": 1.486276388168335, "rewards/kidney_reward/std": 1.9127298593521118, "rewards/length2tails_reward/mean": 0.6566610336303711, "rewards/length2tails_reward/std": 0.3470841646194458, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.3343451023101807, "rewards/thermo_reward/std": 2.83933162689209, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.3125, "completions/mean_terminated_length": 271.3125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09568210691213608, "epoch": 0.17, "frac_reward_zero_std": 0.0, "grad_norm": 0.10421138256788254, "learning_rate": 1.999049774625508e-06, "loss": -0.0004, "num_tokens": 734772.0, "reward": 7.292887210845947, "reward_std": 9.549410820007324, "rewards/fitness_reward/mean": 4.795578479766846, "rewards/fitness_reward/std": 4.69606876373291, "rewards/kidney_reward/mean": 1.1542949676513672, "rewards/kidney_reward/std": 2.3051979541778564, "rewards/length2tails_reward/mean": 0.6634117364883423, "rewards/length2tails_reward/std": 0.41317665576934814, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.1766724586486816, "rewards/thermo_reward/std": 2.9498138427734375, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.71875, "completions/mean_terminated_length": 269.71875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09443402104079723, "epoch": 0.172, "frac_reward_zero_std": 0.0, "grad_norm": 0.10363943129777908, "learning_rate": 1.9989930665413145e-06, "loss": 0.0024, "num_tokens": 743435.0, "reward": 8.326192855834961, "reward_std": 8.09758472442627, "rewards/fitness_reward/mean": 5.624570846557617, "rewards/fitness_reward/std": 4.108580112457275, "rewards/kidney_reward/mean": 1.6422455310821533, "rewards/kidney_reward/std": 1.912664771080017, "rewards/length2tails_reward/mean": 0.5016780495643616, "rewards/length2tails_reward/std": 0.43905147910118103, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.9092094898223877, "rewards/thermo_reward/std": 2.8582024574279785, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.15625, "completions/mean_terminated_length": 270.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.095752764493227, "epoch": 0.174, "frac_reward_zero_std": 0.0, "grad_norm": 0.07105634361505508, "learning_rate": 1.9989347158640323e-06, "loss": 0.0005, "num_tokens": 752112.0, "reward": 6.623435020446777, "reward_std": 9.345142364501953, "rewards/fitness_reward/mean": 4.6255388259887695, "rewards/fitness_reward/std": 4.729114532470703, "rewards/kidney_reward/mean": 0.9081051349639893, "rewards/kidney_reward/std": 2.3178250789642334, "rewards/length2tails_reward/mean": 0.6243544816970825, "rewards/length2tails_reward/std": 0.39751023054122925, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.9273550510406494, "rewards/thermo_reward/std": 3.172642469406128, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.34375, "completions/mean_terminated_length": 270.34375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09468840854242444, "epoch": 0.176, "frac_reward_zero_std": 0.0, "grad_norm": 0.10832744091749191, "learning_rate": 1.998874722689604e-06, "loss": 0.0005, "num_tokens": 760795.0, "reward": 5.828799724578857, "reward_std": 10.804570198059082, "rewards/fitness_reward/mean": 3.6554083824157715, "rewards/fitness_reward/std": 5.36220645904541, "rewards/kidney_reward/mean": 0.8743642568588257, "rewards/kidney_reward/std": 2.57539701461792, "rewards/length2tails_reward/mean": 0.5887205600738525, "rewards/length2tails_reward/std": 0.4141828119754791, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.14015531539917, "rewards/thermo_reward/std": 3.175602674484253, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10456154402345419, "epoch": 0.178, "frac_reward_zero_std": 0.0, "grad_norm": 0.07037483900785446, "learning_rate": 1.9988130871166737e-06, "loss": 0.0016, "num_tokens": 769516.0, "reward": 5.9289679527282715, "reward_std": 9.64340877532959, "rewards/fitness_reward/mean": 4.207990646362305, "rewards/fitness_reward/std": 5.013721466064453, "rewards/kidney_reward/mean": 0.8479658365249634, "rewards/kidney_reward/std": 2.3798933029174805, "rewards/length2tails_reward/mean": 0.6725878119468689, "rewards/length2tails_reward/std": 0.40477490425109863, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.7057527899742126, "rewards/thermo_reward/std": 3.074697494506836, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.53125, "completions/mean_terminated_length": 269.53125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09975282941013575, "epoch": 0.18, "frac_reward_zero_std": 0.0, "grad_norm": 0.12680593132972717, "learning_rate": 1.9987498092465854e-06, "loss": 0.0013, "num_tokens": 778173.0, "reward": 6.914088249206543, "reward_std": 8.010117530822754, "rewards/fitness_reward/mean": 4.84921932220459, "rewards/fitness_reward/std": 4.466445446014404, "rewards/kidney_reward/mean": 1.2208207845687866, "rewards/kidney_reward/std": 1.968360185623169, "rewards/length2tails_reward/mean": 0.5152795910835266, "rewards/length2tails_reward/std": 0.41995134949684143, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 0.6987695097923279, "rewards/thermo_reward/std": 2.7424521446228027, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.71875, "completions/mean_terminated_length": 269.71875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09674192778766155, "epoch": 0.182, "frac_reward_zero_std": 0.0, "grad_norm": 0.07509780675172806, "learning_rate": 1.9986848891833845e-06, "loss": 0.0024, "num_tokens": 786836.0, "reward": 8.015886306762695, "reward_std": 8.986359596252441, "rewards/fitness_reward/mean": 4.971525192260742, "rewards/fitness_reward/std": 4.600165843963623, "rewards/kidney_reward/mean": 1.5063060522079468, "rewards/kidney_reward/std": 1.9875105619430542, "rewards/length2tails_reward/mean": 0.5797788500785828, "rewards/length2tails_reward/std": 0.37075045704841614, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.3800766468048096, "rewards/thermo_reward/std": 2.7960541248321533, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10261530056595802, "epoch": 0.184, "frac_reward_zero_std": 0.0, "grad_norm": 0.21921300888061523, "learning_rate": 1.9986183270338147e-06, "loss": 0.004, "num_tokens": 795574.0, "reward": 7.21077823638916, "reward_std": 9.365437507629395, "rewards/fitness_reward/mean": 4.806048393249512, "rewards/fitness_reward/std": 4.652261257171631, "rewards/kidney_reward/mean": 1.1478865146636963, "rewards/kidney_reward/std": 2.238415241241455, "rewards/length2tails_reward/mean": 0.6891014575958252, "rewards/length2tails_reward/std": 0.36245107650756836, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.087932825088501, "rewards/thermo_reward/std": 2.9496536254882812, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.375, "completions/mean_terminated_length": 270.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10041815415024757, "epoch": 0.186, "frac_reward_zero_std": 0.0, "grad_norm": 0.061401546001434326, "learning_rate": 1.998550122907321e-06, "loss": 0.0003, "num_tokens": 804258.0, "reward": 8.16575813293457, "reward_std": 8.563042640686035, "rewards/fitness_reward/mean": 5.563236236572266, "rewards/fitness_reward/std": 4.104979991912842, "rewards/kidney_reward/mean": 1.351224660873413, "rewards/kidney_reward/std": 2.202674627304077, "rewards/length2tails_reward/mean": 0.6049997806549072, "rewards/length2tails_reward/std": 0.3989236354827881, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.0907974243164062, "rewards/thermo_reward/std": 3.0085599422454834, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 270.59375, "completions/mean_terminated_length": 270.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10706782713532448, "epoch": 0.188, "frac_reward_zero_std": 0.0, "grad_norm": 0.16338106989860535, "learning_rate": 1.998480276916048e-06, "loss": -0.0037, "num_tokens": 812949.0, "reward": 7.739925384521484, "reward_std": 9.26419734954834, "rewards/fitness_reward/mean": 4.661970138549805, "rewards/fitness_reward/std": 4.768324851989746, "rewards/kidney_reward/mean": 1.5202274322509766, "rewards/kidney_reward/std": 2.156172037124634, "rewards/length2tails_reward/mean": 0.6187995672225952, "rewards/length2tails_reward/std": 0.3834105432033539, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.3958477973937988, "rewards/thermo_reward/std": 2.8934850692749023, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 264.25, "completions/mean_terminated_length": 264.25, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.11393436696380377, "epoch": 0.19, "frac_reward_zero_std": 0.0, "grad_norm": 0.27784472703933716, "learning_rate": 1.99840878917484e-06, "loss": -0.0679, "num_tokens": 821437.0, "reward": 9.508270263671875, "reward_std": 8.273795127868652, "rewards/fitness_reward/mean": 5.799304962158203, "rewards/fitness_reward/std": 3.9007155895233154, "rewards/kidney_reward/mean": 1.7468581199645996, "rewards/kidney_reward/std": 2.057310104370117, "rewards/length2tails_reward/mean": 0.6669470071792603, "rewards/length2tails_reward/std": 0.3571520447731018, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.7954118251800537, "rewards/thermo_reward/std": 2.665423631668091, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.59375, "completions/mean_terminated_length": 269.59375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.08702652528882027, "epoch": 0.192, "frac_reward_zero_std": 0.0, "grad_norm": 0.08435554802417755, "learning_rate": 1.998335659801241e-06, "loss": 0.0049, "num_tokens": 830096.0, "reward": 4.850047588348389, "reward_std": 9.819940567016602, "rewards/fitness_reward/mean": 3.855299949645996, "rewards/fitness_reward/std": 5.291053295135498, "rewards/kidney_reward/mean": 0.6495881080627441, "rewards/kidney_reward/std": 2.2953221797943115, "rewards/length2tails_reward/mean": 0.571954607963562, "rewards/length2tails_reward/std": 0.40634799003601074, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.18796436488628387, "rewards/thermo_reward/std": 3.090845823287964, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.375, "completions/mean_terminated_length": 269.375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.08522860752418637, "epoch": 0.194, "frac_reward_zero_std": 0.0, "grad_norm": 0.1303863376379013, "learning_rate": 1.9982608889154927e-06, "loss": 0.0034, "num_tokens": 838748.0, "reward": 4.834753036499023, "reward_std": 10.436513900756836, "rewards/fitness_reward/mean": 3.685192584991455, "rewards/fitness_reward/std": 5.200677394866943, "rewards/kidney_reward/mean": 0.5920010805130005, "rewards/kidney_reward/std": 2.409742593765259, "rewards/length2tails_reward/mean": 0.5136910676956177, "rewards/length2tails_reward/std": 0.4063299596309662, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.4061906337738037, "rewards/thermo_reward/std": 3.287951946258545, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 269.59375, "completions/mean_terminated_length": 269.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09816279914230108, "epoch": 0.196, "frac_reward_zero_std": 0.0, "grad_norm": 0.09529352188110352, "learning_rate": 1.9981844766405384e-06, "loss": 0.0006, "num_tokens": 847407.0, "reward": 6.227548599243164, "reward_std": 8.884981155395508, "rewards/fitness_reward/mean": 4.00059700012207, "rewards/fitness_reward/std": 5.081515312194824, "rewards/kidney_reward/mean": 1.2143152952194214, "rewards/kidney_reward/std": 2.036752462387085, "rewards/length2tails_reward/mean": 0.5433864593505859, "rewards/length2tails_reward/std": 0.42335325479507446, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.8582972884178162, "rewards/thermo_reward/std": 2.740398406982422, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10271472111344337, "epoch": 0.198, "frac_reward_zero_std": 0.0, "grad_norm": 0.09906383603811264, "learning_rate": 1.998106423102018e-06, "loss": 0.0005, "num_tokens": 856131.0, "reward": 6.8593549728393555, "reward_std": 10.270197868347168, "rewards/fitness_reward/mean": 3.989349603652954, "rewards/fitness_reward/std": 5.256290435791016, "rewards/kidney_reward/mean": 1.1595712900161743, "rewards/kidney_reward/std": 2.4321062564849854, "rewards/length2tails_reward/mean": 0.6354026794433594, "rewards/length2tails_reward/std": 0.4248734414577484, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.5468939542770386, "rewards/thermo_reward/std": 3.099806070327759, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.65625, "completions/mean_terminated_length": 270.65625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10207742638885975, "epoch": 0.2, "frac_reward_zero_std": 0.0, "grad_norm": 0.09420901536941528, "learning_rate": 1.9980267284282714e-06, "loss": 0.0026, "num_tokens": 864824.0, "reward": 7.539562225341797, "reward_std": 9.378880500793457, "rewards/fitness_reward/mean": 5.152955055236816, "rewards/fitness_reward/std": 4.533102035522461, "rewards/kidney_reward/mean": 1.27552330493927, "rewards/kidney_reward/std": 2.2748749256134033, "rewards/length2tails_reward/mean": 0.6058512330055237, "rewards/length2tails_reward/std": 0.38518333435058594, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.9504985809326172, "rewards/thermo_reward/std": 3.0656003952026367, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.375, "completions/mean_terminated_length": 270.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09604429174214602, "epoch": 0.202, "frac_reward_zero_std": 0.0, "grad_norm": 0.07431904971599579, "learning_rate": 1.9979453927503364e-06, "loss": -0.0, "num_tokens": 873508.0, "reward": 8.550683975219727, "reward_std": 8.27658748626709, "rewards/fitness_reward/mean": 5.503734588623047, "rewards/fitness_reward/std": 4.0965447425842285, "rewards/kidney_reward/mean": 1.611420750617981, "rewards/kidney_reward/std": 2.0140531063079834, "rewards/length2tails_reward/mean": 0.5655707120895386, "rewards/length2tails_reward/std": 0.413197785615921, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.2789708375930786, "rewards/thermo_reward/std": 2.9564778804779053, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 265.5, "completions/mean_terminated_length": 265.5, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.10650088731199503, "epoch": 0.204, "frac_reward_zero_std": 0.0, "grad_norm": 0.22565796971321106, "learning_rate": 1.9978624162019487e-06, "loss": -0.0312, "num_tokens": 882036.0, "reward": 4.945562839508057, "reward_std": 9.923328399658203, "rewards/fitness_reward/mean": 3.839808464050293, "rewards/fitness_reward/std": 5.21298360824585, "rewards/kidney_reward/mean": 0.5858813524246216, "rewards/kidney_reward/std": 2.3432159423828125, "rewards/length2tails_reward/mean": 0.5365294218063354, "rewards/length2tails_reward/std": 0.4102689325809479, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 0.3724702298641205, "rewards/thermo_reward/std": 3.0979273319244385, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 269.3125, "completions/mean_terminated_length": 269.3125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09324608091264963, "epoch": 0.206, "frac_reward_zero_std": 0.0, "grad_norm": 0.07994762808084488, "learning_rate": 1.9977777989195427e-06, "loss": 0.0016, "num_tokens": 890686.0, "reward": 7.803867816925049, "reward_std": 8.779563903808594, "rewards/fitness_reward/mean": 4.832815170288086, "rewards/fitness_reward/std": 4.733124256134033, "rewards/kidney_reward/mean": 1.5026731491088867, "rewards/kidney_reward/std": 1.9731096029281616, "rewards/length2tails_reward/mean": 0.5229619741439819, "rewards/length2tails_reward/std": 0.3595113158226013, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.3160828351974487, "rewards/thermo_reward/std": 2.8773632049560547, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.3125, "completions/mean_terminated_length": 270.3125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09143533185124397, "epoch": 0.208, "frac_reward_zero_std": 0.0, "grad_norm": 0.15814386308193207, "learning_rate": 1.99769154104225e-06, "loss": -0.0006, "num_tokens": 899368.0, "reward": 6.416100978851318, "reward_std": 10.01828670501709, "rewards/fitness_reward/mean": 3.8951594829559326, "rewards/fitness_reward/std": 5.236571788787842, "rewards/kidney_reward/mean": 1.1687991619110107, "rewards/kidney_reward/std": 2.2653310298919678, "rewards/length2tails_reward/mean": 0.5847136974334717, "rewards/length2tails_reward/std": 0.3893841505050659, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.1936709880828857, "rewards/thermo_reward/std": 2.943660020828247, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.84375, "completions/mean_terminated_length": 270.84375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10562636330723763, "epoch": 0.21, "frac_reward_zero_std": 0.0, "grad_norm": 0.07695178687572479, "learning_rate": 1.9976036427119007e-06, "loss": -0.001, "num_tokens": 908067.0, "reward": 8.14999771118164, "reward_std": 8.205320358276367, "rewards/fitness_reward/mean": 5.517228126525879, "rewards/fitness_reward/std": 4.090753078460693, "rewards/kidney_reward/mean": 1.4921324253082275, "rewards/kidney_reward/std": 2.0499107837677, "rewards/length2tails_reward/mean": 0.61553955078125, "rewards/length2tails_reward/std": 0.39413321018218994, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.9790826439857483, "rewards/thermo_reward/std": 2.7510807514190674, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10066411457955837, "epoch": 0.212, "frac_reward_zero_std": 0.0, "grad_norm": 0.22739523649215698, "learning_rate": 1.9975141040730204e-06, "loss": 0.0031, "num_tokens": 916779.0, "reward": 4.442440509796143, "reward_std": 10.857921600341797, "rewards/fitness_reward/mean": 3.392305850982666, "rewards/fitness_reward/std": 5.459863185882568, "rewards/kidney_reward/mean": 0.45278146862983704, "rewards/kidney_reward/std": 2.604245185852051, "rewards/length2tails_reward/mean": 0.7072968482971191, "rewards/length2tails_reward/std": 0.3663152754306793, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.426623672246933, "rewards/thermo_reward/std": 3.39660382270813, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 270.09375, "completions/mean_terminated_length": 270.09375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10727217141538858, "epoch": 0.214, "frac_reward_zero_std": 0.0, "grad_norm": 0.11362366378307343, "learning_rate": 1.997422925272834e-06, "loss": 0.0006, "num_tokens": 925454.0, "reward": 7.628767013549805, "reward_std": 8.385255813598633, "rewards/fitness_reward/mean": 5.151345252990723, "rewards/fitness_reward/std": 4.295953750610352, "rewards/kidney_reward/mean": 1.4122976064682007, "rewards/kidney_reward/std": 1.9665799140930176, "rewards/length2tails_reward/mean": 0.5739939212799072, "rewards/length2tails_reward/std": 0.38083434104919434, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.9077249765396118, "rewards/thermo_reward/std": 2.839327096939087, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 266.6875, "completions/mean_terminated_length": 266.6875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.09353059530258179, "epoch": 0.216, "frac_reward_zero_std": 0.0, "grad_norm": 0.2610819637775421, "learning_rate": 1.997330106461261e-06, "loss": -0.028, "num_tokens": 934020.0, "reward": 5.741405010223389, "reward_std": 10.462055206298828, "rewards/fitness_reward/mean": 4.127673625946045, "rewards/fitness_reward/std": 5.146501541137695, "rewards/kidney_reward/mean": 0.7454001903533936, "rewards/kidney_reward/std": 2.6007983684539795, "rewards/length2tails_reward/mean": 0.605473518371582, "rewards/length2tails_reward/std": 0.39351916313171387, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.7077840566635132, "rewards/thermo_reward/std": 3.408210277557373, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 281.40625, "completions/mean_terminated_length": 281.40625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10695363953709602, "epoch": 0.218, "frac_reward_zero_std": 0.0, "grad_norm": 0.6681356430053711, "learning_rate": 1.9972356477909203e-06, "loss": 0.0724, "num_tokens": 943057.0, "reward": 6.142980575561523, "reward_std": 10.004242897033691, "rewards/fitness_reward/mean": 4.553394317626953, "rewards/fitness_reward/std": 4.9470624923706055, "rewards/kidney_reward/mean": 1.0444412231445312, "rewards/kidney_reward/std": 2.3431780338287354, "rewards/length2tails_reward/mean": 0.6724941730499268, "rewards/length2tails_reward/std": 0.36398619413375854, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.37789565324783325, "rewards/thermo_reward/std": 3.2467703819274902, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.25, "completions/mean_terminated_length": 269.25, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.0894242450594902, "epoch": 0.22, "frac_reward_zero_std": 0.0, "grad_norm": 0.08553007990121841, "learning_rate": 1.997139549417124e-06, "loss": 0.0058, "num_tokens": 951705.0, "reward": 7.994678020477295, "reward_std": 8.695246696472168, "rewards/fitness_reward/mean": 5.522009372711182, "rewards/fitness_reward/std": 4.194775581359863, "rewards/kidney_reward/mean": 1.1783056259155273, "rewards/kidney_reward/std": 2.219749927520752, "rewards/length2tails_reward/mean": 0.5040951371192932, "rewards/length2tails_reward/std": 0.41517889499664307, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.1439536809921265, "rewards/thermo_reward/std": 3.077824354171753, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.375, "completions/mean_terminated_length": 269.375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "entropy": 0.09392447862774134, "epoch": 0.222, "frac_reward_zero_std": 0.0, "grad_norm": 0.2115517109632492, "learning_rate": 1.9970418114978816e-06, "loss": -0.003, "num_tokens": 960357.0, "reward": 6.93775749206543, "reward_std": 9.360590934753418, "rewards/fitness_reward/mean": 4.801181793212891, "rewards/fitness_reward/std": 4.678520202636719, "rewards/kidney_reward/mean": 1.1626392602920532, "rewards/kidney_reward/std": 2.380502939224243, "rewards/length2tails_reward/mean": 0.5976717472076416, "rewards/length2tails_reward/std": 0.37266412377357483, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 0.82041996717453, "rewards/thermo_reward/std": 2.931457042694092, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 276.625, "completions/mean_terminated_length": 276.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1262630559504032, "epoch": 0.224, "frac_reward_zero_std": 0.0, "grad_norm": 1.7861679792404175, "learning_rate": 1.9969424341938997e-06, "loss": 0.0888, "num_tokens": 969241.0, "reward": 9.52402114868164, "reward_std": 7.2843241691589355, "rewards/fitness_reward/mean": 5.664241790771484, "rewards/fitness_reward/std": 4.035696506500244, "rewards/kidney_reward/mean": 1.9477050304412842, "rewards/kidney_reward/std": 1.6191411018371582, "rewards/length2tails_reward/mean": 0.4688212275505066, "rewards/length2tails_reward/std": 0.39938884973526, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 1.7714415788650513, "rewards/thermo_reward/std": 2.5693249702453613, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.9375, "completions/mean_terminated_length": 271.9375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10180703271180391, "epoch": 0.226, "frac_reward_zero_std": 0.0, "grad_norm": 0.10458873957395554, "learning_rate": 1.9968414176685787e-06, "loss": -0.0011, "num_tokens": 977975.0, "reward": 5.531233787536621, "reward_std": 10.222626686096191, "rewards/fitness_reward/mean": 3.81538724899292, "rewards/fitness_reward/std": 5.117627143859863, "rewards/kidney_reward/mean": 0.6674569845199585, "rewards/kidney_reward/std": 2.389415740966797, "rewards/length2tails_reward/mean": 0.6948447227478027, "rewards/length2tails_reward/std": 0.3781306743621826, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.8789056539535522, "rewards/thermo_reward/std": 3.083277463912964, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.90625, "completions/mean_terminated_length": 269.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09896293561905622, "epoch": 0.228, "frac_reward_zero_std": 0.0, "grad_norm": 0.0679410845041275, "learning_rate": 1.9967387620880144e-06, "loss": 0.0011, "num_tokens": 986644.0, "reward": 8.611519813537598, "reward_std": 8.677281379699707, "rewards/fitness_reward/mean": 5.582242012023926, "rewards/fitness_reward/std": 4.200404167175293, "rewards/kidney_reward/mean": 1.5517706871032715, "rewards/kidney_reward/std": 2.0585439205169678, "rewards/length2tails_reward/mean": 0.5672687292098999, "rewards/length2tails_reward/std": 0.39586639404296875, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.3207803964614868, "rewards/thermo_reward/std": 2.873594045639038, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 268.625, "completions/mean_terminated_length": 268.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09147355891764164, "epoch": 0.23, "frac_reward_zero_std": 0.0, "grad_norm": 0.121434785425663, "learning_rate": 1.996634467620999e-06, "loss": -0.0005, "num_tokens": 995272.0, "reward": 6.599301815032959, "reward_std": 9.238264083862305, "rewards/fitness_reward/mean": 4.2816853523254395, "rewards/fitness_reward/std": 4.801488876342773, "rewards/kidney_reward/mean": 1.1353163719177246, "rewards/kidney_reward/std": 2.170203685760498, "rewards/length2tails_reward/mean": 0.47178003191947937, "rewards/length2tails_reward/std": 0.4033353328704834, "rewards/repeated_in_batch_reward/mean": 0.875, "rewards/repeated_in_batch_reward/std": 0.33601075410842896, "rewards/thermo_reward/mean": 1.0476219654083252, "rewards/thermo_reward/std": 2.811617612838745, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 265.3125, "completions/mean_terminated_length": 265.3125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.12102366890758276, "epoch": 0.232, "frac_reward_zero_std": 0.0, "grad_norm": 0.42420950531959534, "learning_rate": 1.9965285344390183e-06, "loss": -0.0484, "num_tokens": 1003794.0, "reward": 6.1183061599731445, "reward_std": 9.711209297180176, "rewards/fitness_reward/mean": 4.594058036804199, "rewards/fitness_reward/std": 4.714536190032959, "rewards/kidney_reward/mean": 0.8576416373252869, "rewards/kidney_reward/std": 2.442807674407959, "rewards/length2tails_reward/mean": 0.6816294193267822, "rewards/length2tails_reward/std": 0.38159894943237305, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.498443603515625, "rewards/thermo_reward/std": 3.1422343254089355, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.90625, "completions/mean_terminated_length": 269.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09542591776698828, "epoch": 0.234, "frac_reward_zero_std": 0.0, "grad_norm": 0.17690907418727875, "learning_rate": 1.9964209627162524e-06, "loss": 0.0051, "num_tokens": 1012463.0, "reward": 6.564347743988037, "reward_std": 9.992843627929688, "rewards/fitness_reward/mean": 4.48134183883667, "rewards/fitness_reward/std": 4.829487323760986, "rewards/kidney_reward/mean": 0.9403144121170044, "rewards/kidney_reward/std": 2.4145545959472656, "rewards/length2tails_reward/mean": 0.5347142219543457, "rewards/length2tails_reward/std": 0.41005927324295044, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.9892200827598572, "rewards/thermo_reward/std": 3.2281882762908936, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.6875, "completions/mean_terminated_length": 270.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09838891867548227, "epoch": 0.236, "frac_reward_zero_std": 0.0, "grad_norm": 0.09169985353946686, "learning_rate": 1.996311752629576e-06, "loss": 0.0048, "num_tokens": 1021157.0, "reward": 7.520411491394043, "reward_std": 9.169465065002441, "rewards/fitness_reward/mean": 5.126192092895508, "rewards/fitness_reward/std": 4.452926158905029, "rewards/kidney_reward/mean": 1.2386646270751953, "rewards/kidney_reward/std": 2.297891855239868, "rewards/length2tails_reward/mean": 0.6078680753707886, "rewards/length2tails_reward/std": 0.39115339517593384, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.9947676062583923, "rewards/thermo_reward/std": 2.9921579360961914, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 268.84375, "completions/mean_terminated_length": 268.84375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09704151190817356, "epoch": 0.238, "frac_reward_zero_std": 0.0, "grad_norm": 0.0968693345785141, "learning_rate": 1.9962009043585583e-06, "loss": 0.0018, "num_tokens": 1029792.0, "reward": 7.006888389587402, "reward_std": 8.42821979522705, "rewards/fitness_reward/mean": 5.212890148162842, "rewards/fitness_reward/std": 4.411964416503906, "rewards/kidney_reward/mean": 1.0951461791992188, "rewards/kidney_reward/std": 2.16939640045166, "rewards/length2tails_reward/mean": 0.4964791536331177, "rewards/length2tails_reward/std": 0.36847445368766785, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.5492041707038879, "rewards/thermo_reward/std": 3.0719711780548096, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.625, "completions/mean_terminated_length": 269.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10862219892442226, "epoch": 0.24, "frac_reward_zero_std": 0.0, "grad_norm": 0.08731769770383835, "learning_rate": 1.9960884180854605e-06, "loss": 0.0026, "num_tokens": 1038452.0, "reward": 7.580234050750732, "reward_std": 8.95448112487793, "rewards/fitness_reward/mean": 5.328237533569336, "rewards/fitness_reward/std": 4.111377239227295, "rewards/kidney_reward/mean": 1.0963175296783447, "rewards/kidney_reward/std": 2.2142772674560547, "rewards/length2tails_reward/mean": 0.5319727063179016, "rewards/length2tails_reward/std": 0.4041937291622162, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.0024813413619995, "rewards/thermo_reward/std": 3.4010684490203857, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.71875, "completions/mean_terminated_length": 269.71875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10291161388158798, "epoch": 0.242, "frac_reward_zero_std": 0.0, "grad_norm": 0.13009320199489594, "learning_rate": 1.995974293995239e-06, "loss": -0.0034, "num_tokens": 1047115.0, "reward": 7.1537346839904785, "reward_std": 8.474778175354004, "rewards/fitness_reward/mean": 5.0962724685668945, "rewards/fitness_reward/std": 4.272363662719727, "rewards/kidney_reward/mean": 1.317286491394043, "rewards/kidney_reward/std": 2.1928529739379883, "rewards/length2tails_reward/mean": 0.5597797632217407, "rewards/length2tails_reward/std": 0.38189437985420227, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 0.5904480814933777, "rewards/thermo_reward/std": 2.832634449005127, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 267.1875, "completions/mean_terminated_length": 267.1875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.10709732491523027, "epoch": 0.244, "frac_reward_zero_std": 0.0, "grad_norm": 0.523727536201477, "learning_rate": 1.9958585322755417e-06, "loss": -0.0344, "num_tokens": 1055697.0, "reward": 4.962468147277832, "reward_std": 10.686628341674805, "rewards/fitness_reward/mean": 3.538210868835449, "rewards/fitness_reward/std": 5.383859157562256, "rewards/kidney_reward/mean": 0.7698438763618469, "rewards/kidney_reward/std": 2.587346076965332, "rewards/length2tails_reward/mean": 0.7483822107315063, "rewards/length2tails_reward/std": 0.33493828773498535, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.4795750379562378, "rewards/thermo_reward/std": 3.2550032138824463, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.5, "completions/mean_terminated_length": 269.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09620358562096953, "epoch": 0.246, "frac_reward_zero_std": 0.0, "grad_norm": 0.08945641666650772, "learning_rate": 1.9957411331167095e-06, "loss": -0.0016, "num_tokens": 1064353.0, "reward": 8.262110710144043, "reward_std": 8.13237190246582, "rewards/fitness_reward/mean": 5.478936672210693, "rewards/fitness_reward/std": 4.158201694488525, "rewards/kidney_reward/mean": 1.3797082901000977, "rewards/kidney_reward/std": 2.045048236846924, "rewards/length2tails_reward/mean": 0.5142487287521362, "rewards/length2tails_reward/std": 0.4123833477497101, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.2520408630371094, "rewards/thermo_reward/std": 3.0139739513397217, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 268.8125, "completions/mean_terminated_length": 268.8125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09582378715276718, "epoch": 0.248, "frac_reward_zero_std": 0.0, "grad_norm": 0.11037473380565643, "learning_rate": 1.9956220967117754e-06, "loss": -0.0008, "num_tokens": 1072987.0, "reward": 7.818872928619385, "reward_std": 9.085912704467773, "rewards/fitness_reward/mean": 5.416874885559082, "rewards/fitness_reward/std": 4.186379432678223, "rewards/kidney_reward/mean": 1.1898185014724731, "rewards/kidney_reward/std": 2.350635290145874, "rewards/length2tails_reward/mean": 0.5090211629867554, "rewards/length2tails_reward/std": 0.4061613976955414, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.0612778663635254, "rewards/thermo_reward/std": 3.424208164215088, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.40625, "completions/mean_terminated_length": 269.40625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10349469911307096, "epoch": 0.25, "frac_reward_zero_std": 0.0, "grad_norm": 0.2761553227901459, "learning_rate": 1.9955014232564663e-06, "loss": -0.001, "num_tokens": 1081640.0, "reward": 11.220212936401367, "reward_std": 4.9659504890441895, "rewards/fitness_reward/mean": 6.890782356262207, "rewards/fitness_reward/std": 2.0400962829589844, "rewards/kidney_reward/mean": 2.115264415740967, "rewards/kidney_reward/std": 1.3640040159225464, "rewards/length2tails_reward/mean": 0.556138277053833, "rewards/length2tails_reward/std": 0.32990503311157227, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.0585532188415527, "rewards/thermo_reward/std": 2.3020107746124268, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.15625, "completions/mean_terminated_length": 270.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10269631631672382, "epoch": 0.252, "frac_reward_zero_std": 0.0, "grad_norm": 0.15978340804576874, "learning_rate": 1.9953791129491983e-06, "loss": -0.0021, "num_tokens": 1090317.0, "reward": 7.519195079803467, "reward_std": 8.24329662322998, "rewards/fitness_reward/mean": 5.177550315856934, "rewards/fitness_reward/std": 4.349887371063232, "rewards/kidney_reward/mean": 1.2797091007232666, "rewards/kidney_reward/std": 2.0155045986175537, "rewards/length2tails_reward/mean": 0.5976382493972778, "rewards/length2tails_reward/std": 0.37005844712257385, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.9021725654602051, "rewards/thermo_reward/std": 2.889704942703247, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0951996436342597, "epoch": 0.254, "frac_reward_zero_std": 0.0, "grad_norm": 0.0809130147099495, "learning_rate": 1.99525516599108e-06, "loss": 0.0034, "num_tokens": 1098997.0, "reward": 6.134978294372559, "reward_std": 9.807641983032227, "rewards/fitness_reward/mean": 4.49169397354126, "rewards/fitness_reward/std": 4.9330644607543945, "rewards/kidney_reward/mean": 1.0262835025787354, "rewards/kidney_reward/std": 2.369699239730835, "rewards/length2tails_reward/mean": 0.6029974222183228, "rewards/length2tails_reward/std": 0.3860572874546051, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.4567016363143921, "rewards/thermo_reward/std": 3.154005765914917, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 270.09375, "completions/mean_terminated_length": 270.09375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10363792441785336, "epoch": 0.256, "frac_reward_zero_std": 0.0, "grad_norm": 0.2734423875808716, "learning_rate": 1.995129582585911e-06, "loss": -0.0037, "num_tokens": 1107672.0, "reward": 10.065555572509766, "reward_std": 7.323700428009033, "rewards/fitness_reward/mean": 6.216155052185059, "rewards/fitness_reward/std": 3.252866268157959, "rewards/kidney_reward/mean": 1.8607220649719238, "rewards/kidney_reward/std": 1.8087016344070435, "rewards/length2tails_reward/mean": 0.5705130100250244, "rewards/length2tails_reward/std": 0.37505844235420227, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.8316264152526855, "rewards/thermo_reward/std": 2.7449350357055664, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0994394039735198, "epoch": 0.258, "frac_reward_zero_std": 0.0, "grad_norm": 0.06736920028924942, "learning_rate": 1.9950023629401823e-06, "loss": -0.0011, "num_tokens": 1116388.0, "reward": 8.246410369873047, "reward_std": 8.444506645202637, "rewards/fitness_reward/mean": 5.335963726043701, "rewards/fitness_reward/std": 4.293360233306885, "rewards/kidney_reward/mean": 1.3580021858215332, "rewards/kidney_reward/std": 1.9836363792419434, "rewards/length2tails_reward/mean": 0.6779234409332275, "rewards/length2tails_reward/std": 0.35614529252052307, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 1.390902042388916, "rewards/thermo_reward/std": 2.899958372116089, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 265.25, "completions/mean_terminated_length": 265.25, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.10844458639621735, "epoch": 0.26, "frac_reward_zero_std": 0.0, "grad_norm": 0.5157034397125244, "learning_rate": 1.9948735072630743e-06, "loss": -0.0613, "num_tokens": 1124908.0, "reward": 7.14207124710083, "reward_std": 9.764935493469238, "rewards/fitness_reward/mean": 4.8439483642578125, "rewards/fitness_reward/std": 4.8512396812438965, "rewards/kidney_reward/mean": 1.2248245477676392, "rewards/kidney_reward/std": 2.3549630641937256, "rewards/length2tails_reward/mean": 0.6458985805511475, "rewards/length2tails_reward/std": 0.39737337827682495, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.9087082147598267, "rewards/thermo_reward/std": 3.279670000076294, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 270.9375, "completions/mean_terminated_length": 270.9375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10960131324827671, "epoch": 0.262, "frac_reward_zero_std": 0.0, "grad_norm": 0.06332565099000931, "learning_rate": 1.9947430157664573e-06, "loss": -0.0011, "num_tokens": 1133610.0, "reward": 10.147733688354492, "reward_std": 7.310003757476807, "rewards/fitness_reward/mean": 5.934072494506836, "rewards/fitness_reward/std": 3.8383984565734863, "rewards/kidney_reward/mean": 1.8624502420425415, "rewards/kidney_reward/std": 1.6555120944976807, "rewards/length2tails_reward/mean": 0.6436142325401306, "rewards/length2tails_reward/std": 0.367124080657959, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.186849594116211, "rewards/thermo_reward/std": 2.4702565670013428, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.1875, "completions/mean_terminated_length": 271.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11062491126358509, "epoch": 0.264, "frac_reward_zero_std": 0.0, "grad_norm": 0.11164043098688126, "learning_rate": 1.9946108886648925e-06, "loss": 0.0007, "num_tokens": 1142320.0, "reward": 8.521629333496094, "reward_std": 8.78165054321289, "rewards/fitness_reward/mean": 5.5364556312561035, "rewards/fitness_reward/std": 4.161991119384766, "rewards/kidney_reward/mean": 1.4564990997314453, "rewards/kidney_reward/std": 2.1312105655670166, "rewards/length2tails_reward/mean": 0.6381245851516724, "rewards/length2tails_reward/std": 0.3892870545387268, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.3648622035980225, "rewards/thermo_reward/std": 2.9711556434631348, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.9375, "completions/mean_terminated_length": 270.9375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10500903520733118, "epoch": 0.266, "frac_reward_zero_std": 0.0, "grad_norm": 0.06918232887983322, "learning_rate": 1.994477126175629e-06, "loss": -0.0014, "num_tokens": 1151022.0, "reward": 6.013577461242676, "reward_std": 9.91647720336914, "rewards/fitness_reward/mean": 4.206317901611328, "rewards/fitness_reward/std": 5.031452655792236, "rewards/kidney_reward/mean": 1.0269017219543457, "rewards/kidney_reward/std": 2.391968250274658, "rewards/length2tails_reward/mean": 0.6653873920440674, "rewards/length2tails_reward/std": 0.3800601363182068, "rewards/repeated_in_batch_reward/mean": 0.90625, "rewards/repeated_in_batch_reward/std": 0.2961445748806, "rewards/thermo_reward/mean": 0.6231940984725952, "rewards/thermo_reward/std": 3.0378589630126953, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.375, "completions/mean_terminated_length": 270.375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09510938823223114, "epoch": 0.268, "frac_reward_zero_std": 0.0, "grad_norm": 0.10929831117391586, "learning_rate": 1.994341728518606e-06, "loss": 0.006, "num_tokens": 1159706.0, "reward": 4.943110942840576, "reward_std": 10.271505355834961, "rewards/fitness_reward/mean": 3.783754825592041, "rewards/fitness_reward/std": 5.177763938903809, "rewards/kidney_reward/mean": 0.6879823207855225, "rewards/kidney_reward/std": 2.4497663974761963, "rewards/length2tails_reward/mean": 0.5846083164215088, "rewards/length2tails_reward/std": 0.44055286049842834, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.3129129707813263, "rewards/thermo_reward/std": 3.2092156410217285, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 263.65625, "completions/mean_terminated_length": 263.65625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.11222867853939533, "epoch": 0.27, "frac_reward_zero_std": 0.0, "grad_norm": 0.32465440034866333, "learning_rate": 1.994204695916451e-06, "loss": -0.0471, "num_tokens": 1168175.0, "reward": 5.701061248779297, "reward_std": 10.690949440002441, "rewards/fitness_reward/mean": 3.6731255054473877, "rewards/fitness_reward/std": 5.350979328155518, "rewards/kidney_reward/mean": 0.8555735945701599, "rewards/kidney_reward/std": 2.5589215755462646, "rewards/length2tails_reward/mean": 0.7329180836677551, "rewards/length2tails_reward/std": 0.31912535429000854, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.9990701675415039, "rewards/thermo_reward/std": 3.3446438312530518, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 269.40625, "completions/mean_terminated_length": 269.40625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09923252556473017, "epoch": 0.272, "frac_reward_zero_std": 0.0, "grad_norm": 0.06646724790334702, "learning_rate": 1.99406602859448e-06, "loss": 0.0031, "num_tokens": 1176828.0, "reward": 8.432554244995117, "reward_std": 7.6039862632751465, "rewards/fitness_reward/mean": 5.432223320007324, "rewards/fitness_reward/std": 4.257699966430664, "rewards/kidney_reward/mean": 1.6723767518997192, "rewards/kidney_reward/std": 1.8270021677017212, "rewards/length2tails_reward/mean": 0.5134449005126953, "rewards/length2tails_reward/std": 0.4071425497531891, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.1766104698181152, "rewards/thermo_reward/std": 2.9092674255371094, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.0625, "completions/mean_terminated_length": 270.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1019074423238635, "epoch": 0.274, "frac_reward_zero_std": 0.0, "grad_norm": 0.06599363684654236, "learning_rate": 1.9939257267806963e-06, "loss": 0.0037, "num_tokens": 1185502.0, "reward": 6.927309989929199, "reward_std": 9.177210807800293, "rewards/fitness_reward/mean": 4.872681617736816, "rewards/fitness_reward/std": 4.654974460601807, "rewards/kidney_reward/mean": 1.2820192575454712, "rewards/kidney_reward/std": 2.128758668899536, "rewards/length2tails_reward/mean": 0.6186249256134033, "rewards/length2tails_reward/std": 0.3422185778617859, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.6107470989227295, "rewards/thermo_reward/std": 3.1053736209869385, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.15625, "completions/mean_terminated_length": 269.15625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09770497446879745, "epoch": 0.276, "frac_reward_zero_std": 0.0, "grad_norm": 0.0907408595085144, "learning_rate": 1.9937837907057903e-06, "loss": 0.0023, "num_tokens": 1194147.0, "reward": 8.242408752441406, "reward_std": 7.911430835723877, "rewards/fitness_reward/mean": 5.498637676239014, "rewards/fitness_reward/std": 4.1017327308654785, "rewards/kidney_reward/mean": 1.4611058235168457, "rewards/kidney_reward/std": 1.9765921831130981, "rewards/length2tails_reward/mean": 0.4971846342086792, "rewards/length2tails_reward/std": 0.4135686755180359, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.1329469680786133, "rewards/thermo_reward/std": 2.697221517562866, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.21875, "completions/mean_terminated_length": 270.21875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10403876565396786, "epoch": 0.278, "frac_reward_zero_std": 0.0, "grad_norm": 0.06455550342798233, "learning_rate": 1.993640220603141e-06, "loss": -0.0012, "num_tokens": 1202826.0, "reward": 10.303912162780762, "reward_std": 6.239825248718262, "rewards/fitness_reward/mean": 6.35327672958374, "rewards/fitness_reward/std": 3.1890387535095215, "rewards/kidney_reward/mean": 2.065305233001709, "rewards/kidney_reward/std": 1.3484585285186768, "rewards/length2tails_reward/mean": 0.5866552591323853, "rewards/length2tails_reward/std": 0.38823702931404114, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.7266652584075928, "rewards/thermo_reward/std": 2.520035743713379, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09568120446056128, "epoch": 0.28, "frac_reward_zero_std": 0.0, "grad_norm": 0.24781930446624756, "learning_rate": 1.993495016708813e-06, "loss": 0.0036, "num_tokens": 1211506.0, "reward": 7.731461048126221, "reward_std": 8.551918983459473, "rewards/fitness_reward/mean": 4.975318431854248, "rewards/fitness_reward/std": 4.3804097175598145, "rewards/kidney_reward/mean": 1.2619259357452393, "rewards/kidney_reward/std": 2.03277850151062, "rewards/length2tails_reward/mean": 0.4962129592895508, "rewards/length2tails_reward/std": 0.44050833582878113, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.344595193862915, "rewards/thermo_reward/std": 2.948279619216919, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.96875, "completions/mean_terminated_length": 269.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09721511509269476, "epoch": 0.282, "frac_reward_zero_std": 0.0, "grad_norm": 0.07676605135202408, "learning_rate": 1.9933481792615583e-06, "loss": 0.0034, "num_tokens": 1220177.0, "reward": 7.450370788574219, "reward_std": 9.537321090698242, "rewards/fitness_reward/mean": 4.905481338500977, "rewards/fitness_reward/std": 4.722832202911377, "rewards/kidney_reward/mean": 1.3217523097991943, "rewards/kidney_reward/std": 2.265947103500366, "rewards/length2tails_reward/mean": 0.5830744504928589, "rewards/length2tails_reward/std": 0.3848836421966553, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.0648295879364014, "rewards/thermo_reward/std": 3.070201873779297, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.21875, "completions/mean_terminated_length": 269.21875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09513065218925476, "epoch": 0.284, "frac_reward_zero_std": 0.0, "grad_norm": 0.14675599336624146, "learning_rate": 1.9931997085028128e-06, "loss": -0.0064, "num_tokens": 1228824.0, "reward": 7.320596218109131, "reward_std": 9.154064178466797, "rewards/fitness_reward/mean": 5.166498184204102, "rewards/fitness_reward/std": 4.510073184967041, "rewards/kidney_reward/mean": 1.226707935333252, "rewards/kidney_reward/std": 2.355365037918091, "rewards/length2tails_reward/mean": 0.5027331709861755, "rewards/length2tails_reward/std": 0.38256677985191345, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.7771167755126953, "rewards/thermo_reward/std": 3.0694499015808105, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 264.46875, "completions/mean_terminated_length": 264.46875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.10037917224690318, "epoch": 0.286, "frac_reward_zero_std": 0.0, "grad_norm": 0.42658743262290955, "learning_rate": 1.9930496046767007e-06, "loss": -0.0316, "num_tokens": 1237319.0, "reward": 6.065443992614746, "reward_std": 9.798822402954102, "rewards/fitness_reward/mean": 4.448328495025635, "rewards/fitness_reward/std": 5.0034284591674805, "rewards/kidney_reward/mean": 0.932793378829956, "rewards/kidney_reward/std": 2.4061214923858643, "rewards/length2tails_reward/mean": 0.5339027047157288, "rewards/length2tails_reward/std": 0.4264541268348694, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.5309317111968994, "rewards/thermo_reward/std": 3.0993340015411377, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09499884769320488, "epoch": 0.288, "frac_reward_zero_std": 0.0, "grad_norm": 0.3067867159843445, "learning_rate": 1.9928978680300293e-06, "loss": 0.0005, "num_tokens": 1246080.0, "reward": 4.834901809692383, "reward_std": 10.813472747802734, "rewards/fitness_reward/mean": 3.440321445465088, "rewards/fitness_reward/std": 5.519410133361816, "rewards/kidney_reward/mean": 0.6061902642250061, "rewards/kidney_reward/std": 2.588884115219116, "rewards/length2tails_reward/mean": 0.7453749179840088, "rewards/length2tails_reward/std": 0.3469243347644806, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.6138525605201721, "rewards/thermo_reward/std": 3.252837657928467, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.96875, "completions/mean_terminated_length": 269.96875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.11109276209026575, "epoch": 0.29, "frac_reward_zero_std": 0.0, "grad_norm": 0.43782028555870056, "learning_rate": 1.9927444988122917e-06, "loss": -0.0001, "num_tokens": 1254751.0, "reward": 6.893056392669678, "reward_std": 9.528643608093262, "rewards/fitness_reward/mean": 4.82741641998291, "rewards/fitness_reward/std": 4.739230155944824, "rewards/kidney_reward/mean": 1.2925114631652832, "rewards/kidney_reward/std": 2.267174482345581, "rewards/length2tails_reward/mean": 0.6154762506484985, "rewards/length2tails_reward/std": 0.3850167989730835, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.6115807294845581, "rewards/thermo_reward/std": 3.185163974761963, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10539795272052288, "epoch": 0.292, "frac_reward_zero_std": 0.0, "grad_norm": 0.1180371344089508, "learning_rate": 1.992589497275665e-06, "loss": 0.001, "num_tokens": 1263471.0, "reward": 8.197807312011719, "reward_std": 9.033075332641602, "rewards/fitness_reward/mean": 5.30934476852417, "rewards/fitness_reward/std": 4.3426971435546875, "rewards/kidney_reward/mean": 1.4229786396026611, "rewards/kidney_reward/std": 2.1725473403930664, "rewards/length2tails_reward/mean": 0.7128552198410034, "rewards/length2tails_reward/std": 0.3286351263523102, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.2941994667053223, "rewards/thermo_reward/std": 2.925856590270996, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.78125, "completions/mean_terminated_length": 269.78125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11158214416354895, "epoch": 0.294, "frac_reward_zero_std": 0.0, "grad_norm": 0.08068758994340897, "learning_rate": 1.99243286367501e-06, "loss": 0.0001, "num_tokens": 1272136.0, "reward": 10.795753479003906, "reward_std": 6.373899936676025, "rewards/fitness_reward/mean": 6.341925621032715, "rewards/fitness_reward/std": 3.2274014949798584, "rewards/kidney_reward/mean": 2.1037323474884033, "rewards/kidney_reward/std": 1.3544044494628906, "rewards/length2tails_reward/mean": 0.6009948253631592, "rewards/length2tails_reward/std": 0.3666466176509857, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.1899969577789307, "rewards/thermo_reward/std": 2.2698984146118164, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.1875, "completions/mean_terminated_length": 270.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10888968966901302, "epoch": 0.296, "frac_reward_zero_std": 0.0, "grad_norm": 0.06824659556150436, "learning_rate": 1.9922745982678716e-06, "loss": -0.003, "num_tokens": 1280814.0, "reward": 8.972225189208984, "reward_std": 6.666226387023926, "rewards/fitness_reward/mean": 5.843311786651611, "rewards/fitness_reward/std": 3.766287088394165, "rewards/kidney_reward/mean": 1.5617635250091553, "rewards/kidney_reward/std": 1.6977827548980713, "rewards/length2tails_reward/mean": 0.566065788269043, "rewards/length2tails_reward/std": 0.3749234974384308, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.4105430841445923, "rewards/thermo_reward/std": 2.7509870529174805, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 270.34375, "completions/mean_terminated_length": 270.34375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10543608199805021, "epoch": 0.298, "frac_reward_zero_std": 0.0, "grad_norm": 0.062118224799633026, "learning_rate": 1.9921147013144777e-06, "loss": 0.0018, "num_tokens": 1289497.0, "reward": 9.55693244934082, "reward_std": 7.30353307723999, "rewards/fitness_reward/mean": 6.239426612854004, "rewards/fitness_reward/std": 3.359330177307129, "rewards/kidney_reward/mean": 1.744706392288208, "rewards/kidney_reward/std": 1.7386865615844727, "rewards/length2tails_reward/mean": 0.6120542883872986, "rewards/length2tails_reward/std": 0.3751331865787506, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.4115936756134033, "rewards/thermo_reward/std": 3.0637288093566895, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.4375, "completions/mean_terminated_length": 271.4375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10638347268104553, "epoch": 0.3, "frac_reward_zero_std": 0.0, "grad_norm": 0.07195970416069031, "learning_rate": 1.991953173077738e-06, "loss": -0.0008, "num_tokens": 1298215.0, "reward": 11.724641799926758, "reward_std": 4.774008750915527, "rewards/fitness_reward/mean": 7.004184722900391, "rewards/fitness_reward/std": 2.019498586654663, "rewards/kidney_reward/mean": 2.293221950531006, "rewards/kidney_reward/std": 1.1455600261688232, "rewards/length2tails_reward/mean": 0.7141435146331787, "rewards/length2tails_reward/std": 0.3148179054260254, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.2558200359344482, "rewards/thermo_reward/std": 2.349818468093872, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 270.3125, "completions/mean_terminated_length": 270.3125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.10944250784814358, "epoch": 0.302, "frac_reward_zero_std": 0.0, "grad_norm": 0.13008372485637665, "learning_rate": 1.991790013823246e-06, "loss": -0.003, "num_tokens": 1306897.0, "reward": 9.20780086517334, "reward_std": 7.8232316970825195, "rewards/fitness_reward/mean": 5.768603324890137, "rewards/fitness_reward/std": 4.000181674957275, "rewards/kidney_reward/mean": 1.7911608219146729, "rewards/kidney_reward/std": 1.9888068437576294, "rewards/length2tails_reward/mean": 0.5865706205368042, "rewards/length2tails_reward/std": 0.39467447996139526, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.4893792867660522, "rewards/thermo_reward/std": 2.8161673545837402, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.40625, "completions/mean_terminated_length": 270.40625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.10371760232374072, "epoch": 0.304, "frac_reward_zero_std": 0.0, "grad_norm": 0.25305068492889404, "learning_rate": 1.9916252238192755e-06, "loss": -0.0012, "num_tokens": 1315582.0, "reward": 7.669151306152344, "reward_std": 8.788591384887695, "rewards/fitness_reward/mean": 5.513693809509277, "rewards/fitness_reward/std": 4.228607654571533, "rewards/kidney_reward/mean": 1.3250335454940796, "rewards/kidney_reward/std": 2.290503740310669, "rewards/length2tails_reward/mean": 0.6143225431442261, "rewards/length2tails_reward/std": 0.39379915595054626, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.66899174451828, "rewards/thermo_reward/std": 2.995438575744629, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.28125, "completions/mean_terminated_length": 270.28125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.1102075595408678, "epoch": 0.306, "frac_reward_zero_std": 0.0, "grad_norm": 0.1603720337152481, "learning_rate": 1.991458803336782e-06, "loss": -0.0001, "num_tokens": 1324263.0, "reward": 9.525554656982422, "reward_std": 6.73523473739624, "rewards/fitness_reward/mean": 6.219589710235596, "rewards/fitness_reward/std": 3.4215612411499023, "rewards/kidney_reward/mean": 1.6574617624282837, "rewards/kidney_reward/std": 1.7385835647583008, "rewards/length2tails_reward/mean": 0.6276772022247314, "rewards/length2tails_reward/std": 0.36669981479644775, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.4857351779937744, "rewards/thermo_reward/std": 2.6110165119171143, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.1875, "completions/mean_terminated_length": 270.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11156374774873257, "epoch": 0.308, "frac_reward_zero_std": 0.0, "grad_norm": 0.06302237510681152, "learning_rate": 1.9912907526494026e-06, "loss": -0.0008, "num_tokens": 1332941.0, "reward": 9.845077514648438, "reward_std": 6.1798095703125, "rewards/fitness_reward/mean": 6.535126686096191, "rewards/fitness_reward/std": 2.8043503761291504, "rewards/kidney_reward/mean": 1.7477779388427734, "rewards/kidney_reward/std": 1.621852993965149, "rewards/length2tails_reward/mean": 0.5888516902923584, "rewards/length2tails_reward/std": 0.3823993504047394, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.403287649154663, "rewards/thermo_reward/std": 2.6505541801452637, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 268.71875, "completions/mean_terminated_length": 268.71875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09071650542318821, "epoch": 0.31, "frac_reward_zero_std": 0.0, "grad_norm": 0.1547355204820633, "learning_rate": 1.9911210720334545e-06, "loss": -0.0015, "num_tokens": 1341572.0, "reward": 8.669211387634277, "reward_std": 8.183530807495117, "rewards/fitness_reward/mean": 5.633013725280762, "rewards/fitness_reward/std": 4.088678359985352, "rewards/kidney_reward/mean": 1.656423807144165, "rewards/kidney_reward/std": 1.9646308422088623, "rewards/length2tails_reward/mean": 0.4567784070968628, "rewards/length2tails_reward/std": 0.37301844358444214, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.2340962886810303, "rewards/thermo_reward/std": 2.6925230026245117, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.3125, "completions/mean_terminated_length": 269.3125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10991678666323423, "epoch": 0.312, "frac_reward_zero_std": 0.0, "grad_norm": 0.13348576426506042, "learning_rate": 1.9909497617679347e-06, "loss": 0.0, "num_tokens": 1350222.0, "reward": 9.542964935302734, "reward_std": 6.098407745361328, "rewards/fitness_reward/mean": 6.580748558044434, "rewards/fitness_reward/std": 2.6743195056915283, "rewards/kidney_reward/mean": 1.6786231994628906, "rewards/kidney_reward/std": 1.656028389930725, "rewards/length2tails_reward/mean": 0.5179070234298706, "rewards/length2tails_reward/std": 0.3983818292617798, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.1318026781082153, "rewards/thermo_reward/std": 2.8300745487213135, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.625, "completions/mean_terminated_length": 269.625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.10514571238309145, "epoch": 0.314, "frac_reward_zero_std": 0.0, "grad_norm": 0.8836485147476196, "learning_rate": 1.99077682213452e-06, "loss": -0.0252, "num_tokens": 1358882.0, "reward": 11.270038604736328, "reward_std": 6.071025848388672, "rewards/fitness_reward/mean": 6.54879903793335, "rewards/fitness_reward/std": 2.982909917831421, "rewards/kidney_reward/mean": 2.0715489387512207, "rewards/kidney_reward/std": 1.5798089504241943, "rewards/length2tails_reward/mean": 0.6404703259468079, "rewards/length2tails_reward/std": 0.3451295495033264, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4856441020965576, "rewards/thermo_reward/std": 2.362983465194702, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 267.6875, "completions/mean_terminated_length": 267.6875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.09205622784793377, "epoch": 0.316, "frac_reward_zero_std": 0.0, "grad_norm": 0.12054961919784546, "learning_rate": 1.9906022534175653e-06, "loss": -0.0152, "num_tokens": 1367480.0, "reward": 5.936069965362549, "reward_std": 8.101700782775879, "rewards/fitness_reward/mean": 4.768802642822266, "rewards/fitness_reward/std": 4.337706089019775, "rewards/kidney_reward/mean": 0.7667664289474487, "rewards/kidney_reward/std": 2.0656919479370117, "rewards/length2tails_reward/mean": 0.5114237070083618, "rewards/length2tails_reward/std": 0.4007585048675537, "rewards/repeated_in_batch_reward/mean": 0.875, "rewards/repeated_in_batch_reward/std": 0.33601075410842896, "rewards/thermo_reward/mean": 0.261858731508255, "rewards/thermo_reward/std": 3.0215578079223633, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.53125, "completions/mean_terminated_length": 270.53125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11868708301335573, "epoch": 0.318, "frac_reward_zero_std": 0.0, "grad_norm": 0.17192107439041138, "learning_rate": 1.9904260559041067e-06, "loss": 0.0031, "num_tokens": 1376169.0, "reward": 10.619016647338867, "reward_std": 6.427189350128174, "rewards/fitness_reward/mean": 6.33050012588501, "rewards/fitness_reward/std": 3.0782217979431152, "rewards/kidney_reward/mean": 1.9571011066436768, "rewards/kidney_reward/std": 1.5345371961593628, "rewards/length2tails_reward/mean": 0.639854907989502, "rewards/length2tails_reward/std": 0.35456106066703796, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.1674301624298096, "rewards/thermo_reward/std": 2.3626694679260254, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10438992222771049, "epoch": 0.32, "frac_reward_zero_std": 0.0, "grad_norm": 0.09375450015068054, "learning_rate": 1.990248229883855e-06, "loss": -0.0047, "num_tokens": 1384849.0, "reward": 8.270386695861816, "reward_std": 7.849250793457031, "rewards/fitness_reward/mean": 5.114340782165527, "rewards/fitness_reward/std": 4.376468658447266, "rewards/kidney_reward/mean": 1.6285489797592163, "rewards/kidney_reward/std": 1.7222367525100708, "rewards/length2tails_reward/mean": 0.5605906248092651, "rewards/length2tails_reward/std": 0.39478322863578796, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.371437430381775, "rewards/thermo_reward/std": 2.751140832901001, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.34375, "completions/mean_terminated_length": 270.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1004866361618042, "epoch": 0.322, "frac_reward_zero_std": 0.0, "grad_norm": 0.06271824985742569, "learning_rate": 1.9900687756492018e-06, "loss": -0.0021, "num_tokens": 1393532.0, "reward": 7.980679512023926, "reward_std": 9.225789070129395, "rewards/fitness_reward/mean": 4.897353649139404, "rewards/fitness_reward/std": 4.632526397705078, "rewards/kidney_reward/mean": 1.4106215238571167, "rewards/kidney_reward/std": 2.1333272457122803, "rewards/length2tails_reward/mean": 0.6001532077789307, "rewards/length2tails_reward/std": 0.36171630024909973, "rewards/repeated_in_batch_reward/mean": 0.90625, "rewards/repeated_in_batch_reward/std": 0.2961445748806, "rewards/thermo_reward/mean": 1.5220634937286377, "rewards/thermo_reward/std": 2.998164176940918, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 265.5625, "completions/mean_terminated_length": 265.5625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.11199045088142157, "epoch": 0.324, "frac_reward_zero_std": 0.0, "grad_norm": 0.3692063093185425, "learning_rate": 1.9898876934952135e-06, "loss": -0.0394, "num_tokens": 1402062.0, "reward": 8.863862991333008, "reward_std": 6.968045711517334, "rewards/fitness_reward/mean": 5.994227409362793, "rewards/fitness_reward/std": 3.6847662925720215, "rewards/kidney_reward/mean": 1.8423200845718384, "rewards/kidney_reward/std": 1.663252830505371, "rewards/length2tails_reward/mean": 0.4631441533565521, "rewards/length2tails_reward/std": 0.41804710030555725, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.8810014724731445, "rewards/thermo_reward/std": 2.657252311706543, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 269.375, "completions/mean_terminated_length": 269.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09533638320863247, "epoch": 0.326, "frac_reward_zero_std": 0.0, "grad_norm": 0.051175836473703384, "learning_rate": 1.9897049837196347e-06, "loss": 0.003, "num_tokens": 1410714.0, "reward": 8.676294326782227, "reward_std": 8.282102584838867, "rewards/fitness_reward/mean": 5.569098949432373, "rewards/fitness_reward/std": 4.0911359786987305, "rewards/kidney_reward/mean": 1.6246623992919922, "rewards/kidney_reward/std": 1.9904649257659912, "rewards/length2tails_reward/mean": 0.5004905462265015, "rewards/length2tails_reward/std": 0.3728918433189392, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.3324849605560303, "rewards/thermo_reward/std": 2.884413242340088, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.28125, "completions/mean_terminated_length": 270.28125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10455688275396824, "epoch": 0.328, "frac_reward_zero_std": 0.0, "grad_norm": 0.07528374344110489, "learning_rate": 1.989520646622886e-06, "loss": 0.0024, "num_tokens": 1419395.0, "reward": 6.963508605957031, "reward_std": 9.04922103881836, "rewards/fitness_reward/mean": 4.957913398742676, "rewards/fitness_reward/std": 4.619761943817139, "rewards/kidney_reward/mean": 1.1970510482788086, "rewards/kidney_reward/std": 2.2081282138824463, "rewards/length2tails_reward/mean": 0.6426414251327515, "rewards/length2tails_reward/std": 0.3601728081703186, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.6442803144454956, "rewards/thermo_reward/std": 2.9896538257598877, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.53125, "completions/mean_terminated_length": 269.53125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.1044129366055131, "epoch": 0.33, "frac_reward_zero_std": 0.0, "grad_norm": 0.11145590245723724, "learning_rate": 1.989334682508062e-06, "loss": 0.0031, "num_tokens": 1428052.0, "reward": 8.322443008422852, "reward_std": 8.941901206970215, "rewards/fitness_reward/mean": 5.196577072143555, "rewards/fitness_reward/std": 4.4512152671813965, "rewards/kidney_reward/mean": 1.5438371896743774, "rewards/kidney_reward/std": 2.067178726196289, "rewards/length2tails_reward/mean": 0.5185511708259583, "rewards/length2tails_reward/std": 0.39966416358947754, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.4301741123199463, "rewards/thermo_reward/std": 2.9776289463043213, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 269.90625, "completions/mean_terminated_length": 269.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10911750234663486, "epoch": 0.332, "frac_reward_zero_std": 0.0, "grad_norm": 0.4272436201572418, "learning_rate": 1.989147091680936e-06, "loss": 0.0016, "num_tokens": 1436721.0, "reward": 11.553678512573242, "reward_std": 6.220656394958496, "rewards/fitness_reward/mean": 6.640511989593506, "rewards/fitness_reward/std": 2.8358936309814453, "rewards/kidney_reward/mean": 2.229365587234497, "rewards/kidney_reward/std": 1.4573208093643188, "rewards/length2tails_reward/mean": 0.6149606704711914, "rewards/length2tails_reward/std": 0.34033235907554626, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.5223050117492676, "rewards/thermo_reward/std": 2.36811900138855, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 270.0, "completions/mean_terminated_length": 270.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09208371117711067, "epoch": 0.334, "frac_reward_zero_std": 0.0, "grad_norm": 0.13034819066524506, "learning_rate": 1.988957874449953e-06, "loss": 0.0036, "num_tokens": 1445393.0, "reward": 8.939111709594727, "reward_std": 9.140274047851562, "rewards/fitness_reward/mean": 5.218906402587891, "rewards/fitness_reward/std": 4.415480613708496, "rewards/kidney_reward/mean": 1.5986276865005493, "rewards/kidney_reward/std": 2.1802477836608887, "rewards/length2tails_reward/mean": 0.5977396965026855, "rewards/length2tails_reward/std": 0.3474375307559967, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.961803674697876, "rewards/thermo_reward/std": 2.926722764968872, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1023161243647337, "epoch": 0.336, "frac_reward_zero_std": 0.0, "grad_norm": 0.05429290235042572, "learning_rate": 1.9887670311262328e-06, "loss": -0.0021, "num_tokens": 1454109.0, "reward": 11.80929183959961, "reward_std": 4.472561836242676, "rewards/fitness_reward/mean": 6.948788166046143, "rewards/fitness_reward/std": 2.023350238800049, "rewards/kidney_reward/mean": 2.316553831100464, "rewards/kidney_reward/std": 1.0072654485702515, "rewards/length2tails_reward/mean": 0.6750909686088562, "rewards/length2tails_reward/std": 0.3512890338897705, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.376441478729248, "rewards/thermo_reward/std": 2.003629684448242, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10454714018851519, "epoch": 0.338, "frac_reward_zero_std": 0.0, "grad_norm": 0.1053980216383934, "learning_rate": 1.9885745620235697e-06, "loss": -0.0068, "num_tokens": 1462844.0, "reward": 9.110237121582031, "reward_std": 7.532092094421387, "rewards/fitness_reward/mean": 5.710217475891113, "rewards/fitness_reward/std": 3.8100883960723877, "rewards/kidney_reward/mean": 1.6665058135986328, "rewards/kidney_reward/std": 1.8492753505706787, "rewards/length2tails_reward/mean": 0.6642658710479736, "rewards/length2tails_reward/std": 0.39535000920295715, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.567087173461914, "rewards/thermo_reward/std": 2.6299617290496826, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08847399707883596, "epoch": 0.34, "frac_reward_zero_std": 0.0, "grad_norm": 0.07097867131233215, "learning_rate": 1.9883804674584306e-06, "loss": 0.0026, "num_tokens": 1471524.0, "reward": 6.478607177734375, "reward_std": 9.610648155212402, "rewards/fitness_reward/mean": 4.710236549377441, "rewards/fitness_reward/std": 4.720317840576172, "rewards/kidney_reward/mean": 0.9129420518875122, "rewards/kidney_reward/std": 2.3220455646514893, "rewards/length2tails_reward/mean": 0.5628418922424316, "rewards/length2tails_reward/std": 0.41073399782180786, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.6991442441940308, "rewards/thermo_reward/std": 3.20623517036438, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.21875, "completions/mean_terminated_length": 273.21875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11202279198914766, "epoch": 0.342, "frac_reward_zero_std": 0.0, "grad_norm": 0.11420899629592896, "learning_rate": 1.9881847477499555e-06, "loss": 0.0025, "num_tokens": 1480299.0, "reward": 7.849916458129883, "reward_std": 9.838308334350586, "rewards/fitness_reward/mean": 4.82754373550415, "rewards/fitness_reward/std": 4.876658916473389, "rewards/kidney_reward/mean": 1.3583157062530518, "rewards/kidney_reward/std": 2.398590564727783, "rewards/length2tails_reward/mean": 0.7617112994194031, "rewards/length2tails_reward/std": 0.3424564301967621, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.4878864288330078, "rewards/thermo_reward/std": 3.0041966438293457, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.4375, "completions/mean_terminated_length": 271.4375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10550755634903908, "epoch": 0.344, "frac_reward_zero_std": 0.0, "grad_norm": 0.4681124985218048, "learning_rate": 1.9879874032199563e-06, "loss": 0.003, "num_tokens": 1489017.0, "reward": 7.871708869934082, "reward_std": 9.376019477844238, "rewards/fitness_reward/mean": 5.116046905517578, "rewards/fitness_reward/std": 4.47206449508667, "rewards/kidney_reward/mean": 1.302150011062622, "rewards/kidney_reward/std": 2.3035614490509033, "rewards/length2tails_reward/mean": 0.6878268718719482, "rewards/length2tails_reward/std": 0.35356101393699646, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.2847297191619873, "rewards/thermo_reward/std": 2.9786298274993896, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.875, "completions/mean_terminated_length": 270.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10861378256231546, "epoch": 0.346, "frac_reward_zero_std": 0.0, "grad_norm": 0.10255390405654907, "learning_rate": 1.9877884341929164e-06, "loss": -0.0002, "num_tokens": 1497717.0, "reward": 10.451683044433594, "reward_std": 6.380789279937744, "rewards/fitness_reward/mean": 6.58067512512207, "rewards/fitness_reward/std": 2.850431442260742, "rewards/kidney_reward/mean": 1.9992852210998535, "rewards/kidney_reward/std": 1.6703037023544312, "rewards/length2tails_reward/mean": 0.642656683921814, "rewards/length2tails_reward/std": 0.3293820917606354, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.7074573040008545, "rewards/thermo_reward/std": 2.617042064666748, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10535236541181803, "epoch": 0.348, "frac_reward_zero_std": 0.0, "grad_norm": 0.1874845027923584, "learning_rate": 1.9875878409959902e-06, "loss": 0.0, "num_tokens": 1506381.0, "reward": 11.276988983154297, "reward_std": 4.965340614318848, "rewards/fitness_reward/mean": 6.700150489807129, "rewards/fitness_reward/std": 2.6060447692871094, "rewards/kidney_reward/mean": 2.1925644874572754, "rewards/kidney_reward/std": 1.1760989427566528, "rewards/length2tails_reward/mean": 0.5278898477554321, "rewards/length2tails_reward/std": 0.39283841848373413, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.23148512840271, "rewards/thermo_reward/std": 2.2584495544433594, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.1875, "completions/mean_terminated_length": 270.1875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.1035816827788949, "epoch": 0.35, "frac_reward_zero_std": 0.0, "grad_norm": 0.06968332827091217, "learning_rate": 1.9873856239590034e-06, "loss": -0.0035, "num_tokens": 1515059.0, "reward": 9.632209777832031, "reward_std": 7.117125988006592, "rewards/fitness_reward/mean": 5.655714988708496, "rewards/fitness_reward/std": 4.042262077331543, "rewards/kidney_reward/mean": 1.9653785228729248, "rewards/kidney_reward/std": 1.48482084274292, "rewards/length2tails_reward/mean": 0.5557355284690857, "rewards/length2tails_reward/std": 0.4126896262168884, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 1.8617929220199585, "rewards/thermo_reward/std": 2.4798786640167236, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 271.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09991635009646416, "epoch": 0.352, "frac_reward_zero_std": 0.0, "grad_norm": 0.07047705352306366, "learning_rate": 1.98718178341445e-06, "loss": -0.0047, "num_tokens": 1523782.0, "reward": 9.619202613830566, "reward_std": 7.000319004058838, "rewards/fitness_reward/mean": 5.932534694671631, "rewards/fitness_reward/std": 3.6836514472961426, "rewards/kidney_reward/mean": 1.84051513671875, "rewards/kidney_reward/std": 1.6164661645889282, "rewards/length2tails_reward/mean": 0.6280834674835205, "rewards/length2tails_reward/std": 0.4062521755695343, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.6833446025848389, "rewards/thermo_reward/std": 2.712756395339966, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09833968244493008, "epoch": 0.354, "frac_reward_zero_std": 0.0, "grad_norm": 0.06103222444653511, "learning_rate": 1.9869763196974956e-06, "loss": -0.0038, "num_tokens": 1532458.0, "reward": 8.60630989074707, "reward_std": 8.169486999511719, "rewards/fitness_reward/mean": 5.390318393707275, "rewards/fitness_reward/std": 4.1763691902160645, "rewards/kidney_reward/mean": 1.568656325340271, "rewards/kidney_reward/std": 1.8943867683410645, "rewards/length2tails_reward/mean": 0.5682525634765625, "rewards/length2tails_reward/std": 0.4164934456348419, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 1.4967609643936157, "rewards/thermo_reward/std": 2.6284830570220947, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 270.3125, "completions/mean_terminated_length": 270.3125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10469309892505407, "epoch": 0.356, "frac_reward_zero_std": 0.0, "grad_norm": 0.15595051646232605, "learning_rate": 1.9867692331459733e-06, "loss": -0.0012, "num_tokens": 1541140.0, "reward": 7.887426376342773, "reward_std": 8.45262336730957, "rewards/fitness_reward/mean": 5.164236068725586, "rewards/fitness_reward/std": 4.271865367889404, "rewards/kidney_reward/mean": 1.5095407962799072, "rewards/kidney_reward/std": 1.9843440055847168, "rewards/length2tails_reward/mean": 0.5877651572227478, "rewards/length2tails_reward/std": 0.36727017164230347, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.0548732280731201, "rewards/thermo_reward/std": 2.857112169265747, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.1056223763152957, "epoch": 0.358, "frac_reward_zero_std": 0.0, "grad_norm": 0.06294183433055878, "learning_rate": 1.9865605241003845e-06, "loss": -0.0045, "num_tokens": 1549869.0, "reward": 10.02109146118164, "reward_std": 6.616111755371094, "rewards/fitness_reward/mean": 6.333710670471191, "rewards/fitness_reward/std": 3.2548043727874756, "rewards/kidney_reward/mean": 1.9349606037139893, "rewards/kidney_reward/std": 1.5588946342468262, "rewards/length2tails_reward/mean": 0.681917667388916, "rewards/length2tails_reward/std": 0.3754919469356537, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.584228515625, "rewards/thermo_reward/std": 2.648305892944336, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.09375, "completions/mean_terminated_length": 271.09375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10342816868796945, "epoch": 0.36, "frac_reward_zero_std": 0.0, "grad_norm": 0.0756261870265007, "learning_rate": 1.9863501929038997e-06, "loss": -0.0007, "num_tokens": 1558576.0, "reward": 6.979414463043213, "reward_std": 9.660487174987793, "rewards/fitness_reward/mean": 4.5425872802734375, "rewards/fitness_reward/std": 4.841700553894043, "rewards/kidney_reward/mean": 1.0623948574066162, "rewards/kidney_reward/std": 2.2623515129089355, "rewards/length2tails_reward/mean": 0.658684253692627, "rewards/length2tails_reward/std": 0.381984144449234, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.2085641622543335, "rewards/thermo_reward/std": 3.015942096710205, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1091130031272769, "epoch": 0.362, "frac_reward_zero_std": 0.0, "grad_norm": 0.20066675543785095, "learning_rate": 1.9861382399023546e-06, "loss": -0.0014, "num_tokens": 1567308.0, "reward": 9.064428329467773, "reward_std": 7.4316086769104, "rewards/fitness_reward/mean": 6.176366329193115, "rewards/fitness_reward/std": 3.3721847534179688, "rewards/kidney_reward/mean": 1.4785151481628418, "rewards/kidney_reward/std": 1.8624529838562012, "rewards/length2tails_reward/mean": 0.6511343121528625, "rewards/length2tails_reward/std": 0.39212942123413086, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.2444343566894531, "rewards/thermo_reward/std": 2.9861016273498535, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.21875, "completions/mean_terminated_length": 271.21875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10733828134834766, "epoch": 0.364, "frac_reward_zero_std": 0.0, "grad_norm": 0.089052215218544, "learning_rate": 1.985924665444254e-06, "loss": -0.0008, "num_tokens": 1576019.0, "reward": 7.983680725097656, "reward_std": 8.451763153076172, "rewards/fitness_reward/mean": 5.157064437866211, "rewards/fitness_reward/std": 4.3942108154296875, "rewards/kidney_reward/mean": 1.4362343549728394, "rewards/kidney_reward/std": 2.0755233764648438, "rewards/length2tails_reward/mean": 0.6398859620094299, "rewards/length2tails_reward/std": 0.39266476035118103, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.2263935804367065, "rewards/thermo_reward/std": 2.855211019515991, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 268.6875, "completions/mean_terminated_length": 268.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0979474326595664, "epoch": 0.366, "frac_reward_zero_std": 0.0, "grad_norm": 0.07531240582466125, "learning_rate": 1.9857094698807663e-06, "loss": -0.0012, "num_tokens": 1584649.0, "reward": 9.46112060546875, "reward_std": 7.812624454498291, "rewards/fitness_reward/mean": 5.858367919921875, "rewards/fitness_reward/std": 3.7519853115081787, "rewards/kidney_reward/mean": 1.7861301898956299, "rewards/kidney_reward/std": 1.8386919498443604, "rewards/length2tails_reward/mean": 0.4857766628265381, "rewards/length2tails_reward/std": 0.35673651099205017, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.6680454015731812, "rewards/thermo_reward/std": 2.858015298843384, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.375, "completions/mean_terminated_length": 269.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10918328817933798, "epoch": 0.368, "frac_reward_zero_std": 0.0, "grad_norm": 0.08639601618051529, "learning_rate": 1.9854926535657268e-06, "loss": 0.0024, "num_tokens": 1593301.0, "reward": 8.301219940185547, "reward_std": 7.373518943786621, "rewards/fitness_reward/mean": 6.095327377319336, "rewards/fitness_reward/std": 3.293491840362549, "rewards/kidney_reward/mean": 1.462351679801941, "rewards/kidney_reward/std": 1.9810431003570557, "rewards/length2tails_reward/mean": 0.5736892223358154, "rewards/length2tails_reward/std": 0.3360866606235504, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.5861721634864807, "rewards/thermo_reward/std": 3.047858953475952, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.21875, "completions/mean_terminated_length": 271.21875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09814856946468353, "epoch": 0.37, "frac_reward_zero_std": 0.0, "grad_norm": 0.27271583676338196, "learning_rate": 1.9852742168556353e-06, "loss": -0.0028, "num_tokens": 1602012.0, "reward": 6.0901641845703125, "reward_std": 10.192516326904297, "rewards/fitness_reward/mean": 3.8694372177124023, "rewards/fitness_reward/std": 5.288331031799316, "rewards/kidney_reward/mean": 0.9918411374092102, "rewards/kidney_reward/std": 2.3886590003967285, "rewards/length2tails_reward/mean": 0.6225963830947876, "rewards/length2tails_reward/std": 0.40456104278564453, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 1.0728765726089478, "rewards/thermo_reward/std": 3.126966953277588, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.53125, "completions/mean_terminated_length": 270.53125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10325660184025764, "epoch": 0.372, "frac_reward_zero_std": 0.0, "grad_norm": 0.5015514492988586, "learning_rate": 1.9850541601096565e-06, "loss": -0.0007, "num_tokens": 1610701.0, "reward": 7.214511871337891, "reward_std": 9.890998840332031, "rewards/fitness_reward/mean": 4.450111389160156, "rewards/fitness_reward/std": 5.011604309082031, "rewards/kidney_reward/mean": 1.2150135040283203, "rewards/kidney_reward/std": 2.3736073970794678, "rewards/length2tails_reward/mean": 0.5748029351234436, "rewards/length2tails_reward/std": 0.4244406521320343, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.391906976699829, "rewards/thermo_reward/std": 3.0063281059265137, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.15625, "completions/mean_terminated_length": 272.15625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10104412585496902, "epoch": 0.374, "frac_reward_zero_std": 0.0, "grad_norm": 0.09444596618413925, "learning_rate": 1.9848324836896185e-06, "loss": -0.0007, "num_tokens": 1619442.0, "reward": 8.749990463256836, "reward_std": 8.362157821655273, "rewards/fitness_reward/mean": 5.2926411628723145, "rewards/fitness_reward/std": 4.387931823730469, "rewards/kidney_reward/mean": 1.7029788494110107, "rewards/kidney_reward/std": 1.8882851600646973, "rewards/length2tails_reward/mean": 0.6900187134742737, "rewards/length2tails_reward/std": 0.3773564100265503, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.5853676795959473, "rewards/thermo_reward/std": 2.8135106563568115, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 268.25, "completions/mean_terminated_length": 268.25, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.09201871184632182, "epoch": 0.376, "frac_reward_zero_std": 0.0, "grad_norm": 0.19163833558559418, "learning_rate": 1.9846091879600123e-06, "loss": -0.0053, "num_tokens": 1628058.0, "reward": 7.008411884307861, "reward_std": 10.034730911254883, "rewards/fitness_reward/mean": 4.643359661102295, "rewards/fitness_reward/std": 4.732275009155273, "rewards/kidney_reward/mean": 1.0078006982803345, "rewards/kidney_reward/std": 2.456132411956787, "rewards/length2tails_reward/mean": 0.5096907615661621, "rewards/length2tails_reward/std": 0.4166960120201111, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.2062824964523315, "rewards/thermo_reward/std": 3.2689130306243896, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.625, "completions/mean_terminated_length": 270.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10740740410983562, "epoch": 0.378, "frac_reward_zero_std": 0.0, "grad_norm": 0.0717017725110054, "learning_rate": 1.9843842732879912e-06, "loss": 0.0008, "num_tokens": 1636750.0, "reward": 8.669443130493164, "reward_std": 8.252121925354004, "rewards/fitness_reward/mean": 5.560091018676758, "rewards/fitness_reward/std": 4.254537105560303, "rewards/kidney_reward/mean": 1.642980933189392, "rewards/kidney_reward/std": 2.0037031173706055, "rewards/length2tails_reward/mean": 0.6206031441688538, "rewards/length2tails_reward/std": 0.38836368918418884, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.3043103218078613, "rewards/thermo_reward/std": 2.909794330596924, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 270.90625, "completions/mean_terminated_length": 270.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09697377681732178, "epoch": 0.38, "frac_reward_zero_std": 0.0, "grad_norm": 0.10256364941596985, "learning_rate": 1.9841577400433715e-06, "loss": -0.0019, "num_tokens": 1645451.0, "reward": 11.645931243896484, "reward_std": 4.940904140472412, "rewards/fitness_reward/mean": 6.897823810577393, "rewards/fitness_reward/std": 2.0551373958587646, "rewards/kidney_reward/mean": 2.0693302154541016, "rewards/kidney_reward/std": 1.310836911201477, "rewards/length2tails_reward/mean": 0.588446855545044, "rewards/length2tails_reward/std": 0.38965609669685364, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.5199332237243652, "rewards/thermo_reward/std": 2.15556001663208, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 265.5625, "completions/mean_terminated_length": 265.5625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.1106615662574768, "epoch": 0.382, "frac_reward_zero_std": 0.0, "grad_norm": 0.5850204229354858, "learning_rate": 1.9839295885986295e-06, "loss": -0.0808, "num_tokens": 1653981.0, "reward": 10.428827285766602, "reward_std": 6.0077056884765625, "rewards/fitness_reward/mean": 6.506265640258789, "rewards/fitness_reward/std": 2.7486143112182617, "rewards/kidney_reward/mean": 2.0006251335144043, "rewards/kidney_reward/std": 1.3787603378295898, "rewards/length2tails_reward/mean": 0.6331422328948975, "rewards/length2tails_reward/std": 0.37140002846717834, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.7586220502853394, "rewards/thermo_reward/std": 2.7680184841156006, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.90625, "completions/mean_terminated_length": 269.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10634398553520441, "epoch": 0.384, "frac_reward_zero_std": 0.0, "grad_norm": 0.07652062177658081, "learning_rate": 1.9836998193289038e-06, "loss": -0.0068, "num_tokens": 1662650.0, "reward": 10.72802448272705, "reward_std": 5.3211283683776855, "rewards/fitness_reward/mean": 6.853697299957275, "rewards/fitness_reward/std": 1.9966347217559814, "rewards/kidney_reward/mean": 1.8450117111206055, "rewards/kidney_reward/std": 1.6589022874832153, "rewards/length2tails_reward/mean": 0.5354677438735962, "rewards/length2tails_reward/std": 0.4196150004863739, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.875769019126892, "rewards/thermo_reward/std": 2.499567985534668, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1068008397705853, "epoch": 0.386, "frac_reward_zero_std": 0.0, "grad_norm": 0.1525895595550537, "learning_rate": 1.9834684326119915e-06, "loss": -0.0025, "num_tokens": 1671382.0, "reward": 8.363931655883789, "reward_std": 8.365130424499512, "rewards/fitness_reward/mean": 5.590615272521973, "rewards/fitness_reward/std": 4.040146350860596, "rewards/kidney_reward/mean": 1.3528531789779663, "rewards/kidney_reward/std": 2.032255172729492, "rewards/length2tails_reward/mean": 0.6880743503570557, "rewards/length2tails_reward/std": 0.36803537607192993, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.2516562938690186, "rewards/thermo_reward/std": 3.0139715671539307, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10786493681371212, "epoch": 0.388, "frac_reward_zero_std": 0.0, "grad_norm": 0.1900808960199356, "learning_rate": 1.98323542882835e-06, "loss": 0.0009, "num_tokens": 1680147.0, "reward": 6.575066566467285, "reward_std": 9.77808666229248, "rewards/fitness_reward/mean": 4.85727071762085, "rewards/fitness_reward/std": 4.808586597442627, "rewards/kidney_reward/mean": 1.0090776681900024, "rewards/kidney_reward/std": 2.365453004837036, "rewards/length2tails_reward/mean": 0.7621837258338928, "rewards/length2tails_reward/std": 0.32337686419487, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.532500147819519, "rewards/thermo_reward/std": 3.1945226192474365, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.8125, "completions/mean_terminated_length": 270.8125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10732665937393904, "epoch": 0.39, "frac_reward_zero_std": 0.0, "grad_norm": 0.06709634512662888, "learning_rate": 1.9830008083610964e-06, "loss": -0.0003, "num_tokens": 1688845.0, "reward": 8.791130065917969, "reward_std": 8.393945693969727, "rewards/fitness_reward/mean": 5.577649116516113, "rewards/fitness_reward/std": 4.212261199951172, "rewards/kidney_reward/mean": 1.5661720037460327, "rewards/kidney_reward/std": 1.9226521253585815, "rewards/length2tails_reward/mean": 0.6098302602767944, "rewards/length2tails_reward/std": 0.40974730253219604, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.4863260984420776, "rewards/thermo_reward/std": 2.968444585800171, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.6875, "completions/mean_terminated_length": 270.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10089261550456285, "epoch": 0.392, "frac_reward_zero_std": 0.0, "grad_norm": 0.14002718031406403, "learning_rate": 1.982764571596004e-06, "loss": 0.0004, "num_tokens": 1697539.0, "reward": 8.121866226196289, "reward_std": 8.40750789642334, "rewards/fitness_reward/mean": 5.079010009765625, "rewards/fitness_reward/std": 4.419788360595703, "rewards/kidney_reward/mean": 1.4961414337158203, "rewards/kidney_reward/std": 1.9750374555587769, "rewards/length2tails_reward/mean": 0.6118898391723633, "rewards/length2tails_reward/std": 0.39571040868759155, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.3855266571044922, "rewards/thermo_reward/std": 2.767749786376953, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.65625, "completions/mean_terminated_length": 269.65625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0964017822407186, "epoch": 0.394, "frac_reward_zero_std": 0.0, "grad_norm": 0.1263321042060852, "learning_rate": 1.9825267189215047e-06, "loss": -0.0012, "num_tokens": 1706200.0, "reward": 8.513752937316895, "reward_std": 8.518855094909668, "rewards/fitness_reward/mean": 5.270111083984375, "rewards/fitness_reward/std": 4.430210590362549, "rewards/kidney_reward/mean": 1.7094039916992188, "rewards/kidney_reward/std": 1.8589138984680176, "rewards/length2tails_reward/mean": 0.5403980612754822, "rewards/length2tails_reward/std": 0.3900211453437805, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.3801982402801514, "rewards/thermo_reward/std": 2.656468629837036, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11020283959805965, "epoch": 0.396, "frac_reward_zero_std": 0.0, "grad_norm": 0.06863213330507278, "learning_rate": 1.9822872507286887e-06, "loss": -0.0007, "num_tokens": 1714950.0, "reward": 7.311960697174072, "reward_std": 9.5090970993042, "rewards/fitness_reward/mean": 4.814295768737793, "rewards/fitness_reward/std": 4.636486530303955, "rewards/kidney_reward/mean": 1.1605840921401978, "rewards/kidney_reward/std": 2.2890994548797607, "rewards/length2tails_reward/mean": 0.7226451635360718, "rewards/length2tails_reward/std": 0.33867594599723816, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.1648168563842773, "rewards/thermo_reward/std": 3.1884403228759766, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.875, "completions/mean_terminated_length": 269.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.109826167114079, "epoch": 0.398, "frac_reward_zero_std": 0.0, "grad_norm": 0.20804084837436676, "learning_rate": 1.9820461674113e-06, "loss": -0.0033, "num_tokens": 1723618.0, "reward": 7.401235580444336, "reward_std": 8.667125701904297, "rewards/fitness_reward/mean": 5.332418441772461, "rewards/fitness_reward/std": 4.303309440612793, "rewards/kidney_reward/mean": 1.2573118209838867, "rewards/kidney_reward/std": 2.1245853900909424, "rewards/length2tails_reward/mean": 0.5858131051063538, "rewards/length2tails_reward/std": 0.3964066207408905, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.6529242992401123, "rewards/thermo_reward/std": 3.1356236934661865, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.46875, "completions/mean_terminated_length": 270.46875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0995940687134862, "epoch": 0.4, "frac_reward_zero_std": 0.0, "grad_norm": 0.14407816529273987, "learning_rate": 1.9818034693657404e-06, "loss": -0.0018, "num_tokens": 1732305.0, "reward": 10.46460247039795, "reward_std": 7.0780768394470215, "rewards/fitness_reward/mean": 6.211520195007324, "rewards/fitness_reward/std": 3.2666735649108887, "rewards/kidney_reward/mean": 1.8888168334960938, "rewards/kidney_reward/std": 1.7426211833953857, "rewards/length2tails_reward/mean": 0.5859594345092773, "rewards/length2tails_reward/std": 0.3815963566303253, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.2056691646575928, "rewards/thermo_reward/std": 2.506162405014038, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.5625, "completions/mean_terminated_length": 269.5625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09627026505768299, "epoch": 0.402, "frac_reward_zero_std": 0.0, "grad_norm": 0.1052074134349823, "learning_rate": 1.9815591569910653e-06, "loss": -0.0033, "num_tokens": 1740963.0, "reward": 8.757131576538086, "reward_std": 8.111889839172363, "rewards/fitness_reward/mean": 5.644378185272217, "rewards/fitness_reward/std": 3.914625644683838, "rewards/kidney_reward/mean": 1.5931066274642944, "rewards/kidney_reward/std": 1.9486136436462402, "rewards/length2tails_reward/mean": 0.5394684076309204, "rewards/length2tails_reward/std": 0.38302308320999146, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.365700364112854, "rewards/thermo_reward/std": 2.8859643936157227, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.1875, "completions/mean_terminated_length": 270.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10567681677639484, "epoch": 0.404, "frac_reward_zero_std": 0.0, "grad_norm": 0.08214101195335388, "learning_rate": 1.9813132306889854e-06, "loss": -0.0019, "num_tokens": 1749641.0, "reward": 11.298721313476562, "reward_std": 4.785417079925537, "rewards/fitness_reward/mean": 7.003148555755615, "rewards/fitness_reward/std": 2.025360345840454, "rewards/kidney_reward/mean": 2.3104147911071777, "rewards/kidney_reward/std": 1.040230393409729, "rewards/length2tails_reward/mean": 0.5893241763114929, "rewards/length2tails_reward/std": 0.381794810295105, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.8262252807617188, "rewards/thermo_reward/std": 2.564194679260254, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.53125, "completions/mean_terminated_length": 270.53125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1086399257183075, "epoch": 0.406, "frac_reward_zero_std": 0.0, "grad_norm": 0.111503966152668, "learning_rate": 1.981065690863864e-06, "loss": 0.0013, "num_tokens": 1758330.0, "reward": 8.51333236694336, "reward_std": 8.681916236877441, "rewards/fitness_reward/mean": 5.229581832885742, "rewards/fitness_reward/std": 4.509924411773682, "rewards/kidney_reward/mean": 1.6099677085876465, "rewards/kidney_reward/std": 2.048271656036377, "rewards/length2tails_reward/mean": 0.6296185255050659, "rewards/length2tails_reward/std": 0.333383709192276, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.5108214616775513, "rewards/thermo_reward/std": 2.850874900817871, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.96875, "completions/mean_terminated_length": 269.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09983132593333721, "epoch": 0.408, "frac_reward_zero_std": 0.0, "grad_norm": 0.0952368900179863, "learning_rate": 1.9808165379227195e-06, "loss": 0.001, "num_tokens": 1767001.0, "reward": 9.538595199584961, "reward_std": 8.022995948791504, "rewards/fitness_reward/mean": 6.058516502380371, "rewards/fitness_reward/std": 3.590620756149292, "rewards/kidney_reward/mean": 1.707038164138794, "rewards/kidney_reward/std": 2.0653762817382812, "rewards/length2tails_reward/mean": 0.5839086174964905, "rewards/length2tails_reward/std": 0.3778667449951172, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.6146492958068848, "rewards/thermo_reward/std": 2.8776142597198486, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.03125, "completions/mean_terminated_length": 269.03125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10124222468584776, "epoch": 0.41, "frac_reward_zero_std": 0.0, "grad_norm": 0.0786767452955246, "learning_rate": 1.98056577227522e-06, "loss": 0.0004, "num_tokens": 1775642.0, "reward": 8.277677536010742, "reward_std": 8.470403671264648, "rewards/fitness_reward/mean": 5.537692070007324, "rewards/fitness_reward/std": 3.9734997749328613, "rewards/kidney_reward/mean": 1.4755680561065674, "rewards/kidney_reward/std": 2.1754350662231445, "rewards/length2tails_reward/mean": 0.4818817973136902, "rewards/length2tails_reward/std": 0.4087766408920288, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.1162288188934326, "rewards/thermo_reward/std": 2.740323781967163, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.875, "completions/mean_terminated_length": 270.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09372684638947248, "epoch": 0.412, "frac_reward_zero_std": 0.0, "grad_norm": 0.07380766421556473, "learning_rate": 1.9803133943336874e-06, "loss": -0.0034, "num_tokens": 1784342.0, "reward": 7.565323352813721, "reward_std": 8.948416709899902, "rewards/fitness_reward/mean": 4.959705352783203, "rewards/fitness_reward/std": 4.494466304779053, "rewards/kidney_reward/mean": 1.414535403251648, "rewards/kidney_reward/std": 2.106023073196411, "rewards/length2tails_reward/mean": 0.596523642539978, "rewards/length2tails_reward/std": 0.43391671776771545, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 1.0376806259155273, "rewards/thermo_reward/std": 2.8510472774505615, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.9375, "completions/mean_terminated_length": 269.9375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10692477971315384, "epoch": 0.414, "frac_reward_zero_std": 0.0, "grad_norm": 0.11711820214986801, "learning_rate": 1.9800594045130928e-06, "loss": 0.0029, "num_tokens": 1793012.0, "reward": 8.848175048828125, "reward_std": 8.377004623413086, "rewards/fitness_reward/mean": 5.87990140914917, "rewards/fitness_reward/std": 3.820521593093872, "rewards/kidney_reward/mean": 1.4009602069854736, "rewards/kidney_reward/std": 2.222740888595581, "rewards/length2tails_reward/mean": 0.5454018115997314, "rewards/length2tails_reward/std": 0.3798547387123108, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.412773847579956, "rewards/thermo_reward/std": 3.057025194168091, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.8125, "completions/mean_terminated_length": 270.8125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11469333432614803, "epoch": 0.416, "frac_reward_zero_std": 0.0, "grad_norm": 0.07612542062997818, "learning_rate": 1.97980380323106e-06, "loss": -0.0047, "num_tokens": 1801710.0, "reward": 7.182913780212402, "reward_std": 9.170951843261719, "rewards/fitness_reward/mean": 4.917283535003662, "rewards/fitness_reward/std": 4.572504997253418, "rewards/kidney_reward/mean": 1.20510733127594, "rewards/kidney_reward/std": 2.1817445755004883, "rewards/length2tails_reward/mean": 0.6531508564949036, "rewards/length2tails_reward/std": 0.3536904454231262, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.895207941532135, "rewards/thermo_reward/std": 2.9525623321533203, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11213183123618364, "epoch": 0.418, "frac_reward_zero_std": 0.0, "grad_norm": 0.07885367423295975, "learning_rate": 1.97954659090786e-06, "loss": -0.0038, "num_tokens": 1810406.0, "reward": 6.919522762298584, "reward_std": 8.68114948272705, "rewards/fitness_reward/mean": 4.821925163269043, "rewards/fitness_reward/std": 4.653115272521973, "rewards/kidney_reward/mean": 1.1035808324813843, "rewards/kidney_reward/std": 2.052344560623169, "rewards/length2tails_reward/mean": 0.632168173789978, "rewards/length2tails_reward/std": 0.3854258358478546, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.8307993412017822, "rewards/thermo_reward/std": 2.9898102283477783, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.34375, "completions/mean_terminated_length": 269.34375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.10570395179092884, "epoch": 0.42, "frac_reward_zero_std": 0.0, "grad_norm": 0.2664085030555725, "learning_rate": 1.9792877679664147e-06, "loss": -0.0205, "num_tokens": 1819057.0, "reward": 7.419079303741455, "reward_std": 8.691427230834961, "rewards/fitness_reward/mean": 5.043792247772217, "rewards/fitness_reward/std": 4.39654541015625, "rewards/kidney_reward/mean": 1.1797351837158203, "rewards/kidney_reward/std": 2.1847445964813232, "rewards/length2tails_reward/mean": 0.6589158773422241, "rewards/length2tails_reward/std": 0.38090842962265015, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.0296605825424194, "rewards/thermo_reward/std": 2.942087173461914, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 268.96875, "completions/mean_terminated_length": 268.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.095816302113235, "epoch": 0.422, "frac_reward_zero_std": 0.0, "grad_norm": 0.08715507388114929, "learning_rate": 1.9790273348322927e-06, "loss": 0.0019, "num_tokens": 1827696.0, "reward": 8.556151390075684, "reward_std": 8.206366539001465, "rewards/fitness_reward/mean": 5.665445327758789, "rewards/fitness_reward/std": 4.011801719665527, "rewards/kidney_reward/mean": 1.4945118427276611, "rewards/kidney_reward/std": 1.9173368215560913, "rewards/length2tails_reward/mean": 0.4852709472179413, "rewards/length2tails_reward/std": 0.39102447032928467, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.2476671934127808, "rewards/thermo_reward/std": 3.026831865310669, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.28125, "completions/mean_terminated_length": 270.28125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10708794835954905, "epoch": 0.424, "frac_reward_zero_std": 0.0, "grad_norm": 0.0768938884139061, "learning_rate": 1.9787652919337115e-06, "loss": -0.001, "num_tokens": 1836377.0, "reward": 11.526771545410156, "reward_std": 5.237046718597412, "rewards/fitness_reward/mean": 6.383553504943848, "rewards/fitness_reward/std": 3.095924139022827, "rewards/kidney_reward/mean": 2.3126468658447266, "rewards/kidney_reward/std": 1.0282318592071533, "rewards/length2tails_reward/mean": 0.6096439361572266, "rewards/length2tails_reward/std": 0.3747730553150177, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6696066856384277, "rewards/thermo_reward/std": 2.085958957672119, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 266.75, "completions/mean_terminated_length": 266.75, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.10727542918175459, "epoch": 0.426, "frac_reward_zero_std": 0.0, "grad_norm": 0.28065893054008484, "learning_rate": 1.9785016397015338e-06, "loss": -0.0418, "num_tokens": 1844945.0, "reward": 6.286587715148926, "reward_std": 9.712299346923828, "rewards/fitness_reward/mean": 4.788792610168457, "rewards/fitness_reward/std": 4.699509143829346, "rewards/kidney_reward/mean": 0.8297417163848877, "rewards/kidney_reward/std": 2.447777032852173, "rewards/length2tails_reward/mean": 0.7117043733596802, "rewards/length2tails_reward/std": 0.34659260511398315, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.4968832731246948, "rewards/thermo_reward/std": 3.415356397628784, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.09375, "completions/mean_terminated_length": 270.09375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10078845452517271, "epoch": 0.428, "frac_reward_zero_std": 0.0, "grad_norm": 0.08422592282295227, "learning_rate": 1.9782363785692705e-06, "loss": 0.0026, "num_tokens": 1853620.0, "reward": 6.68724250793457, "reward_std": 8.57685661315918, "rewards/fitness_reward/mean": 5.163091659545898, "rewards/fitness_reward/std": 4.379825592041016, "rewards/kidney_reward/mean": 1.1229581832885742, "rewards/kidney_reward/std": 2.124249219894409, "rewards/length2tails_reward/mean": 0.5803974866867065, "rewards/length2tails_reward/std": 0.40740472078323364, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.24315303564071655, "rewards/thermo_reward/std": 2.994739294052124, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.15625, "completions/mean_terminated_length": 270.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10598285868763924, "epoch": 0.43, "frac_reward_zero_std": 0.0, "grad_norm": 0.15869423747062683, "learning_rate": 1.9779695089730764e-06, "loss": -0.003, "num_tokens": 1862297.0, "reward": 8.356054306030273, "reward_std": 9.389469146728516, "rewards/fitness_reward/mean": 5.200437068939209, "rewards/fitness_reward/std": 4.589191436767578, "rewards/kidney_reward/mean": 1.5199666023254395, "rewards/kidney_reward/std": 2.305187702178955, "rewards/length2tails_reward/mean": 0.5944569110870361, "rewards/length2tails_reward/std": 0.35584756731987, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.4762049913406372, "rewards/thermo_reward/std": 2.8045899868011475, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.34375, "completions/mean_terminated_length": 271.34375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.11291093192994595, "epoch": 0.432, "frac_reward_zero_std": 0.0, "grad_norm": 0.211589053273201, "learning_rate": 1.9777010313517514e-06, "loss": 0.0001, "num_tokens": 1871012.0, "reward": 9.170920372009277, "reward_std": 7.176021575927734, "rewards/fitness_reward/mean": 6.2553839683532715, "rewards/fitness_reward/std": 3.310086250305176, "rewards/kidney_reward/mean": 1.620316505432129, "rewards/kidney_reward/std": 1.8193150758743286, "rewards/length2tails_reward/mean": 0.6244357824325562, "rewards/length2tails_reward/std": 0.370291143655777, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.1327762603759766, "rewards/thermo_reward/std": 2.8469913005828857, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.375, "completions/mean_terminated_length": 270.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09871920570731163, "epoch": 0.434, "frac_reward_zero_std": 0.0, "grad_norm": 0.07975756376981735, "learning_rate": 1.9774309461467397e-06, "loss": -0.0064, "num_tokens": 1879696.0, "reward": 9.119773864746094, "reward_std": 7.26805305480957, "rewards/fitness_reward/mean": 5.79979133605957, "rewards/fitness_reward/std": 3.581414222717285, "rewards/kidney_reward/mean": 1.6331734657287598, "rewards/kidney_reward/std": 1.749567985534668, "rewards/length2tails_reward/mean": 0.5846536159515381, "rewards/length2tails_reward/std": 0.39755120873451233, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.528343915939331, "rewards/thermo_reward/std": 2.701770305633545, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.875, "completions/mean_terminated_length": 269.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10855805594474077, "epoch": 0.436, "frac_reward_zero_std": 0.0, "grad_norm": 0.061504144221544266, "learning_rate": 1.9771592538021285e-06, "loss": 0.0001, "num_tokens": 1888364.0, "reward": 11.166021347045898, "reward_std": 5.292967319488525, "rewards/fitness_reward/mean": 6.575160503387451, "rewards/fitness_reward/std": 2.870858669281006, "rewards/kidney_reward/mean": 2.0742831230163574, "rewards/kidney_reward/std": 1.3122246265411377, "rewards/length2tails_reward/mean": 0.5768504738807678, "rewards/length2tails_reward/std": 0.3807322382926941, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.3588924407958984, "rewards/thermo_reward/std": 2.110995292663574, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 270.96875, "completions/mean_terminated_length": 270.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09409010969102383, "epoch": 0.438, "frac_reward_zero_std": 0.0, "grad_norm": 0.09661436080932617, "learning_rate": 1.9768859547646473e-06, "loss": 0.0003, "num_tokens": 1897067.0, "reward": 7.066924095153809, "reward_std": 9.021658897399902, "rewards/fitness_reward/mean": 5.256416320800781, "rewards/fitness_reward/std": 4.2729172706604, "rewards/kidney_reward/mean": 1.1841405630111694, "rewards/kidney_reward/std": 2.26008939743042, "rewards/length2tails_reward/mean": 0.5950348973274231, "rewards/length2tails_reward/std": 0.4093562662601471, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.466863214969635, "rewards/thermo_reward/std": 3.3025662899017334, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10664527304470539, "epoch": 0.44, "frac_reward_zero_std": 0.0, "grad_norm": 0.11443617194890976, "learning_rate": 1.9766110494836685e-06, "loss": -0.0004, "num_tokens": 1905731.0, "reward": 7.3894758224487305, "reward_std": 8.807182312011719, "rewards/fitness_reward/mean": 5.3923797607421875, "rewards/fitness_reward/std": 4.229556560516357, "rewards/kidney_reward/mean": 1.0697729587554932, "rewards/kidney_reward/std": 2.2666690349578857, "rewards/length2tails_reward/mean": 0.5529941916465759, "rewards/length2tails_reward/std": 0.3819185793399811, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.7720230221748352, "rewards/thermo_reward/std": 3.1068079471588135, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.5, "completions/mean_terminated_length": 270.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11153780948370695, "epoch": 0.442, "frac_reward_zero_std": 0.0, "grad_norm": 0.1167251318693161, "learning_rate": 1.976334538411204e-06, "loss": 0.0033, "num_tokens": 1914419.0, "reward": 7.049777030944824, "reward_std": 9.890249252319336, "rewards/fitness_reward/mean": 4.516417980194092, "rewards/fitness_reward/std": 5.013523101806641, "rewards/kidney_reward/mean": 1.3213751316070557, "rewards/kidney_reward/std": 2.31786847114563, "rewards/length2tails_reward/mean": 0.6002007126808167, "rewards/length2tails_reward/std": 0.410892128944397, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.0519638061523438, "rewards/thermo_reward/std": 3.2038497924804688, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.5, "completions/mean_terminated_length": 270.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09985514171421528, "epoch": 0.444, "frac_reward_zero_std": 0.0, "grad_norm": 0.06074235960841179, "learning_rate": 1.9760564220019072e-06, "loss": -0.0024, "num_tokens": 1923107.0, "reward": 8.840917587280273, "reward_std": 8.136581420898438, "rewards/fitness_reward/mean": 5.285778999328613, "rewards/fitness_reward/std": 4.406919956207275, "rewards/kidney_reward/mean": 1.6758543252944946, "rewards/kidney_reward/std": 1.9091429710388184, "rewards/length2tails_reward/mean": 0.5887254476547241, "rewards/length2tails_reward/std": 0.41157737374305725, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 1.7266616821289062, "rewards/thermo_reward/std": 2.6433470249176025, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 270.09375, "completions/mean_terminated_length": 270.09375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10312576498836279, "epoch": 0.446, "frac_reward_zero_std": 0.0, "grad_norm": 0.060146745294332504, "learning_rate": 1.9757767007130704e-06, "loss": 0.0018, "num_tokens": 1931782.0, "reward": 9.458915710449219, "reward_std": 7.045760631561279, "rewards/fitness_reward/mean": 6.186161518096924, "rewards/fitness_reward/std": 3.3415544033050537, "rewards/kidney_reward/mean": 1.8518340587615967, "rewards/kidney_reward/std": 1.664615273475647, "rewards/length2tails_reward/mean": 0.5694063901901245, "rewards/length2tails_reward/std": 0.3994041979312897, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.2639801502227783, "rewards/thermo_reward/std": 2.8834218978881836, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 267.625, "completions/mean_terminated_length": 267.625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.1007047207094729, "epoch": 0.448, "frac_reward_zero_std": 0.0, "grad_norm": 0.12512126564979553, "learning_rate": 1.9754953750046246e-06, "loss": -0.0234, "num_tokens": 1940378.0, "reward": 8.606588363647461, "reward_std": 8.08462905883789, "rewards/fitness_reward/mean": 5.883886337280273, "rewards/fitness_reward/std": 3.8256380558013916, "rewards/kidney_reward/mean": 1.5120166540145874, "rewards/kidney_reward/std": 2.1992740631103516, "rewards/length2tails_reward/mean": 0.5627999901771545, "rewards/length2tails_reward/std": 0.3526705205440521, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.0544053316116333, "rewards/thermo_reward/std": 2.9035634994506836, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.46875, "completions/mean_terminated_length": 269.46875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09678153228014708, "epoch": 0.45, "frac_reward_zero_std": 0.0, "grad_norm": 0.18851271271705627, "learning_rate": 1.9752124453391404e-06, "loss": -0.0008, "num_tokens": 1949033.0, "reward": 6.986786842346191, "reward_std": 8.987977027893066, "rewards/fitness_reward/mean": 5.2845048904418945, "rewards/fitness_reward/std": 4.396836280822754, "rewards/kidney_reward/mean": 1.0947299003601074, "rewards/kidney_reward/std": 2.2589919567108154, "rewards/length2tails_reward/mean": 0.5387097597122192, "rewards/length2tails_reward/std": 0.40030577778816223, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.45368072390556335, "rewards/thermo_reward/std": 2.9777472019195557, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.65625, "completions/mean_terminated_length": 270.65625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11522582359611988, "epoch": 0.452, "frac_reward_zero_std": 0.0, "grad_norm": 0.0713803768157959, "learning_rate": 1.9749279121818236e-06, "loss": 0.0016, "num_tokens": 1957726.0, "reward": 11.161405563354492, "reward_std": 7.142817974090576, "rewards/fitness_reward/mean": 6.314478397369385, "rewards/fitness_reward/std": 3.3092589378356934, "rewards/kidney_reward/mean": 2.069758415222168, "rewards/kidney_reward/std": 1.672605276107788, "rewards/length2tails_reward/mean": 0.6350034475326538, "rewards/length2tails_reward/std": 0.3509335219860077, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6136674880981445, "rewards/thermo_reward/std": 2.301509141921997, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 270.375, "completions/mean_terminated_length": 270.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10247354488819838, "epoch": 0.454, "frac_reward_zero_std": 0.0, "grad_norm": 0.08207889646291733, "learning_rate": 1.9746417760005176e-06, "loss": -0.0077, "num_tokens": 1966410.0, "reward": 11.169610023498535, "reward_std": 4.311980724334717, "rewards/fitness_reward/mean": 6.826416969299316, "rewards/fitness_reward/std": 2.1413729190826416, "rewards/kidney_reward/mean": 2.0623292922973633, "rewards/kidney_reward/std": 1.2271078824996948, "rewards/length2tails_reward/mean": 0.5567665100097656, "rewards/length2tails_reward/std": 0.38493385910987854, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.125187873840332, "rewards/thermo_reward/std": 1.8756372928619385, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.46875, "completions/mean_terminated_length": 269.46875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10469500161707401, "epoch": 0.456, "frac_reward_zero_std": 0.0, "grad_norm": 0.08091660588979721, "learning_rate": 1.9743540372657017e-06, "loss": -0.0067, "num_tokens": 1975065.0, "reward": 9.968902587890625, "reward_std": 6.146872520446777, "rewards/fitness_reward/mean": 5.6489458084106445, "rewards/fitness_reward/std": 4.059178829193115, "rewards/kidney_reward/mean": 2.179636001586914, "rewards/kidney_reward/std": 1.0131422281265259, "rewards/length2tails_reward/mean": 0.5025430917739868, "rewards/length2tails_reward/std": 0.36392441391944885, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 1.9963165521621704, "rewards/thermo_reward/std": 2.2949652671813965, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1013132524676621, "epoch": 0.458, "frac_reward_zero_std": 0.0, "grad_norm": 0.2698652446269989, "learning_rate": 1.97406469645049e-06, "loss": 0.0001, "num_tokens": 1983802.0, "reward": 9.640462875366211, "reward_std": 7.675972938537598, "rewards/fitness_reward/mean": 5.40672492980957, "rewards/fitness_reward/std": 4.173493385314941, "rewards/kidney_reward/mean": 1.7126578092575073, "rewards/kidney_reward/std": 1.7750372886657715, "rewards/length2tails_reward/mean": 0.6748954057693481, "rewards/length2tails_reward/std": 0.35597914457321167, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.353590965270996, "rewards/thermo_reward/std": 2.550793409347534, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.28125, "completions/mean_terminated_length": 270.28125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1011765506118536, "epoch": 0.46, "frac_reward_zero_std": 0.0, "grad_norm": 0.1072259470820427, "learning_rate": 1.9737737540306303e-06, "loss": -0.007, "num_tokens": 1992483.0, "reward": 9.985023498535156, "reward_std": 6.017777919769287, "rewards/fitness_reward/mean": 6.292191982269287, "rewards/fitness_reward/std": 3.01308536529541, "rewards/kidney_reward/mean": 1.845158576965332, "rewards/kidney_reward/std": 1.4613128900527954, "rewards/length2tails_reward/mean": 0.5501976013183594, "rewards/length2tails_reward/std": 0.4069270193576813, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.692654013633728, "rewards/thermo_reward/std": 2.388899087905884, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09331366792321205, "epoch": 0.462, "frac_reward_zero_std": 0.0, "grad_norm": 0.06806115806102753, "learning_rate": 1.9734812104845046e-06, "loss": 0.0001, "num_tokens": 2001191.0, "reward": 8.523029327392578, "reward_std": 9.589752197265625, "rewards/fitness_reward/mean": 4.993949890136719, "rewards/fitness_reward/std": 4.623663425445557, "rewards/kidney_reward/mean": 1.4517195224761963, "rewards/kidney_reward/std": 2.2691798210144043, "rewards/length2tails_reward/mean": 0.6628376245498657, "rewards/length2tails_reward/std": 0.3733203411102295, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9110764265060425, "rewards/thermo_reward/std": 2.9277281761169434, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.375, "completions/mean_terminated_length": 270.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10501847136765718, "epoch": 0.464, "frac_reward_zero_std": 0.0, "grad_norm": 0.06102055683732033, "learning_rate": 1.9731870662931275e-06, "loss": -0.0046, "num_tokens": 2009875.0, "reward": 10.10276985168457, "reward_std": 5.667848587036133, "rewards/fitness_reward/mean": 6.6926422119140625, "rewards/fitness_reward/std": 2.6392531394958496, "rewards/kidney_reward/mean": 1.8372772932052612, "rewards/kidney_reward/std": 1.5682625770568848, "rewards/length2tails_reward/mean": 0.5817334651947021, "rewards/length2tails_reward/std": 0.3835720121860504, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.4146769046783447, "rewards/thermo_reward/std": 2.468012571334839, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.34375, "completions/mean_terminated_length": 270.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10510055907070637, "epoch": 0.466, "frac_reward_zero_std": 0.0, "grad_norm": 0.09905203431844711, "learning_rate": 1.9728913219401447e-06, "loss": -0.0012, "num_tokens": 2018558.0, "reward": 8.83747673034668, "reward_std": 7.369777202606201, "rewards/fitness_reward/mean": 5.248540878295898, "rewards/fitness_reward/std": 4.345114707946777, "rewards/kidney_reward/mean": 1.7455861568450928, "rewards/kidney_reward/std": 1.5706603527069092, "rewards/length2tails_reward/mean": 0.5550670623779297, "rewards/length2tails_reward/std": 0.39944586157798767, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.6878437995910645, "rewards/thermo_reward/std": 2.5179495811462402, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08783813379704952, "epoch": 0.468, "frac_reward_zero_std": 0.0, "grad_norm": 0.08770688623189926, "learning_rate": 1.972593977911834e-06, "loss": 0.0015, "num_tokens": 2027222.0, "reward": 5.320065021514893, "reward_std": 9.834723472595215, "rewards/fitness_reward/mean": 3.948270082473755, "rewards/fitness_reward/std": 5.157715320587158, "rewards/kidney_reward/mean": 0.9420422911643982, "rewards/kidney_reward/std": 2.3843331336975098, "rewards/length2tails_reward/mean": 0.5566017627716064, "rewards/length2tails_reward/std": 0.38987359404563904, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.274092435836792, "rewards/thermo_reward/std": 3.1555631160736084, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.65625, "completions/mean_terminated_length": 271.65625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1020322423428297, "epoch": 0.47, "frac_reward_zero_std": 0.0, "grad_norm": 0.04721997305750847, "learning_rate": 1.9722950346971028e-06, "loss": -0.0039, "num_tokens": 2035947.0, "reward": 10.875839233398438, "reward_std": 6.017666816711426, "rewards/fitness_reward/mean": 6.333611488342285, "rewards/fitness_reward/std": 3.2550642490386963, "rewards/kidney_reward/mean": 2.1162962913513184, "rewards/kidney_reward/std": 1.250596046447754, "rewards/length2tails_reward/mean": 0.6817765235900879, "rewards/length2tails_reward/std": 0.36646783351898193, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.25775408744812, "rewards/thermo_reward/std": 2.3996896743774414, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.625, "completions/mean_terminated_length": 270.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11050799861550331, "epoch": 0.472, "frac_reward_zero_std": 0.0, "grad_norm": 0.07714132219552994, "learning_rate": 1.971994492787488e-06, "loss": -0.0022, "num_tokens": 2044639.0, "reward": 9.818836212158203, "reward_std": 6.177550792694092, "rewards/fitness_reward/mean": 6.414205074310303, "rewards/fitness_reward/std": 2.6555421352386475, "rewards/kidney_reward/mean": 1.8081889152526855, "rewards/kidney_reward/std": 1.7323895692825317, "rewards/length2tails_reward/mean": 0.5811498165130615, "rewards/length2tails_reward/std": 0.3792761266231537, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.438327431678772, "rewards/thermo_reward/std": 2.689751148223877, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 264.25, "completions/mean_terminated_length": 264.25, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.12198043707758188, "epoch": 0.474, "frac_reward_zero_std": 0.0, "grad_norm": 0.40211084485054016, "learning_rate": 1.971692352677155e-06, "loss": -0.1007, "num_tokens": 2053127.0, "reward": 10.964704513549805, "reward_std": 6.078287124633789, "rewards/fitness_reward/mean": 6.610565662384033, "rewards/fitness_reward/std": 2.9550764560699463, "rewards/kidney_reward/mean": 2.0635910034179688, "rewards/kidney_reward/std": 1.5378971099853516, "rewards/length2tails_reward/mean": 0.6503303050994873, "rewards/length2tails_reward/std": 0.3064909875392914, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.1255152225494385, "rewards/thermo_reward/std": 2.5424704551696777, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1105615459382534, "epoch": 0.476, "frac_reward_zero_std": 0.0, "grad_norm": 0.08003703504800797, "learning_rate": 1.9713886148628977e-06, "loss": -0.0059, "num_tokens": 2061874.0, "reward": 9.830589294433594, "reward_std": 7.20787239074707, "rewards/fitness_reward/mean": 5.873849868774414, "rewards/fitness_reward/std": 3.709421157836914, "rewards/kidney_reward/mean": 1.8575446605682373, "rewards/kidney_reward/std": 1.6075026988983154, "rewards/length2tails_reward/mean": 0.7331730127334595, "rewards/length2tails_reward/std": 0.33084240555763245, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9258770942687988, "rewards/thermo_reward/std": 2.7858848571777344, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.90625, "completions/mean_terminated_length": 269.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.07857981137931347, "epoch": 0.478, "frac_reward_zero_std": 0.0, "grad_norm": 0.1605258733034134, "learning_rate": 1.971083279844136e-06, "loss": -0.0034, "num_tokens": 2070543.0, "reward": 5.80064582824707, "reward_std": 9.10144329071045, "rewards/fitness_reward/mean": 4.05497932434082, "rewards/fitness_reward/std": 5.024665355682373, "rewards/kidney_reward/mean": 0.9805687069892883, "rewards/kidney_reward/std": 2.246600866317749, "rewards/length2tails_reward/mean": 0.5228518843650818, "rewards/length2tails_reward/std": 0.40586429834365845, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.6128125190734863, "rewards/thermo_reward/std": 3.0637567043304443, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.1875, "completions/mean_terminated_length": 271.1875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.1058599092066288, "epoch": 0.48, "frac_reward_zero_std": 0.0, "grad_norm": 0.07374675571918488, "learning_rate": 1.970776348122918e-06, "loss": -0.0001, "num_tokens": 2079253.0, "reward": 8.957300186157227, "reward_std": 8.268644332885742, "rewards/fitness_reward/mean": 5.5188822746276855, "rewards/fitness_reward/std": 4.086907386779785, "rewards/kidney_reward/mean": 1.638132929801941, "rewards/kidney_reward/std": 1.9194594621658325, "rewards/length2tails_reward/mean": 0.686102569103241, "rewards/length2tails_reward/std": 0.33712905645370483, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.631675124168396, "rewards/thermo_reward/std": 2.6473958492279053, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 268.9375, "completions/mean_terminated_length": 268.9375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.09808468306437135, "epoch": 0.482, "frac_reward_zero_std": 0.0, "grad_norm": 0.10668472200632095, "learning_rate": 1.9704678202039145e-06, "loss": -0.0018, "num_tokens": 2087891.0, "reward": 8.597445487976074, "reward_std": 8.43989372253418, "rewards/fitness_reward/mean": 5.264682769775391, "rewards/fitness_reward/std": 4.307192802429199, "rewards/kidney_reward/mean": 1.4983744621276855, "rewards/kidney_reward/std": 1.901678442955017, "rewards/length2tails_reward/mean": 0.612869381904602, "rewards/length2tails_reward/std": 0.3913209140300751, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.6731013059616089, "rewards/thermo_reward/std": 2.823862314224243, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.09375, "completions/mean_terminated_length": 269.09375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09199962671846151, "epoch": 0.484, "frac_reward_zero_std": 0.0, "grad_norm": 0.11633557081222534, "learning_rate": 1.970157696594423e-06, "loss": -0.0002, "num_tokens": 2096534.0, "reward": 7.774277687072754, "reward_std": 8.003270149230957, "rewards/fitness_reward/mean": 5.301121711730957, "rewards/fitness_reward/std": 4.235241413116455, "rewards/kidney_reward/mean": 1.5189933776855469, "rewards/kidney_reward/std": 1.8085318803787231, "rewards/length2tails_reward/mean": 0.4761810600757599, "rewards/length2tails_reward/std": 0.3997570872306824, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 0.8127946257591248, "rewards/thermo_reward/std": 2.9532229900360107, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10292203165590763, "epoch": 0.486, "frac_reward_zero_std": 0.0, "grad_norm": 0.16641299426555634, "learning_rate": 1.9698459778043624e-06, "loss": 0.0031, "num_tokens": 2105230.0, "reward": 7.040865898132324, "reward_std": 9.38436508178711, "rewards/fitness_reward/mean": 5.185315132141113, "rewards/fitness_reward/std": 4.608910083770752, "rewards/kidney_reward/mean": 1.0840166807174683, "rewards/kidney_reward/std": 2.2991762161254883, "rewards/length2tails_reward/mean": 0.634776771068573, "rewards/length2tails_reward/std": 0.3302549719810486, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.6080567836761475, "rewards/thermo_reward/std": 3.2210071086883545, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 270.46875, "completions/mean_terminated_length": 270.46875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11193057429045439, "epoch": 0.488, "frac_reward_zero_std": 0.0, "grad_norm": 0.09709324687719345, "learning_rate": 1.9695326643462775e-06, "loss": -0.0052, "num_tokens": 2113917.0, "reward": 10.90302848815918, "reward_std": 4.896656036376953, "rewards/fitness_reward/mean": 6.579126358032227, "rewards/fitness_reward/std": 2.6416797637939453, "rewards/kidney_reward/mean": 2.070417881011963, "rewards/kidney_reward/std": 1.136030912399292, "rewards/length2tails_reward/mean": 0.5819152593612671, "rewards/length2tails_reward/std": 0.390530526638031, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.0952932834625244, "rewards/thermo_reward/std": 2.45220685005188, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.03125, "completions/mean_terminated_length": 271.03125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10169417830184102, "epoch": 0.49, "frac_reward_zero_std": 0.0, "grad_norm": 0.08554291725158691, "learning_rate": 1.9692177567353328e-06, "loss": -0.0042, "num_tokens": 2122622.0, "reward": 9.377301216125488, "reward_std": 7.241844654083252, "rewards/fitness_reward/mean": 6.431892395019531, "rewards/fitness_reward/std": 3.0229907035827637, "rewards/kidney_reward/mean": 1.6262295246124268, "rewards/kidney_reward/std": 2.0052170753479004, "rewards/length2tails_reward/mean": 0.6268943548202515, "rewards/length2tails_reward/std": 0.38420239090919495, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.156490683555603, "rewards/thermo_reward/std": 3.046543836593628, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.3125, "completions/mean_terminated_length": 270.3125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11073481850326061, "epoch": 0.492, "frac_reward_zero_std": 0.0, "grad_norm": 0.18680819869041443, "learning_rate": 1.968901255489315e-06, "loss": -0.0029, "num_tokens": 2131304.0, "reward": 10.530344009399414, "reward_std": 6.960169315338135, "rewards/fitness_reward/mean": 5.975531578063965, "rewards/fitness_reward/std": 3.5728461742401123, "rewards/kidney_reward/mean": 1.9520188570022583, "rewards/kidney_reward/std": 1.632800579071045, "rewards/length2tails_reward/mean": 0.6160197257995605, "rewards/length2tails_reward/std": 0.35398852825164795, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4411919116973877, "rewards/thermo_reward/std": 2.3167166709899902, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 270.5, "completions/mean_terminated_length": 270.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11004836764186621, "epoch": 0.494, "frac_reward_zero_std": 0.0, "grad_norm": 0.11552101373672485, "learning_rate": 1.968583161128631e-06, "loss": 0.0027, "num_tokens": 2139992.0, "reward": 11.550786018371582, "reward_std": 4.9059977531433105, "rewards/fitness_reward/mean": 6.833098411560059, "rewards/fitness_reward/std": 2.1057968139648438, "rewards/kidney_reward/mean": 2.2125933170318604, "rewards/kidney_reward/std": 1.2264693975448608, "rewards/length2tails_reward/mean": 0.6199765205383301, "rewards/length2tails_reward/std": 0.37857866287231445, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.343096971511841, "rewards/thermo_reward/std": 2.2181098461151123, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10392105765640736, "epoch": 0.496, "frac_reward_zero_std": 0.0, "grad_norm": 0.068658247590065, "learning_rate": 1.9682634741763067e-06, "loss": -0.0049, "num_tokens": 2148708.0, "reward": 10.745346069335938, "reward_std": 5.804399490356445, "rewards/fitness_reward/mean": 6.5791473388671875, "rewards/fitness_reward/std": 2.680927038192749, "rewards/kidney_reward/mean": 1.9059796333312988, "rewards/kidney_reward/std": 1.4528141021728516, "rewards/length2tails_reward/mean": 0.6526817679405212, "rewards/length2tails_reward/std": 0.37751561403274536, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.0949506759643555, "rewards/thermo_reward/std": 2.1236202716827393, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.5, "completions/mean_terminated_length": 270.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11237195134162903, "epoch": 0.498, "frac_reward_zero_std": 0.0, "grad_norm": 0.0876859650015831, "learning_rate": 1.967942195157987e-06, "loss": 0.0018, "num_tokens": 2157396.0, "reward": 9.161323547363281, "reward_std": 7.447658061981201, "rewards/fitness_reward/mean": 5.935283660888672, "rewards/fitness_reward/std": 3.6782286167144775, "rewards/kidney_reward/mean": 1.6700785160064697, "rewards/kidney_reward/std": 1.729300856590271, "rewards/length2tails_reward/mean": 0.5815059542655945, "rewards/length2tails_reward/std": 0.3755898177623749, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.3978110551834106, "rewards/thermo_reward/std": 2.6698153018951416, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 271.34375, "completions/mean_terminated_length": 271.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10672505851835012, "epoch": 0.5, "frac_reward_zero_std": 0.0, "grad_norm": 0.07486039400100708, "learning_rate": 1.967619324601935e-06, "loss": 0.0024, "num_tokens": 2166111.0, "reward": 9.086592674255371, "reward_std": 8.031876564025879, "rewards/fitness_reward/mean": 5.875249862670898, "rewards/fitness_reward/std": 3.831639289855957, "rewards/kidney_reward/mean": 1.4601225852966309, "rewards/kidney_reward/std": 2.075751781463623, "rewards/length2tails_reward/mean": 0.6911107897758484, "rewards/length2tails_reward/std": 0.34283214807510376, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.582108974456787, "rewards/thermo_reward/std": 2.710895299911499, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.21875, "completions/mean_terminated_length": 271.21875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10726206749677658, "epoch": 0.502, "frac_reward_zero_std": 0.0, "grad_norm": 0.1551721841096878, "learning_rate": 1.9672948630390295e-06, "loss": -0.0015, "num_tokens": 2174822.0, "reward": 9.408416748046875, "reward_std": 8.032960891723633, "rewards/fitness_reward/mean": 5.81867790222168, "rewards/fitness_reward/std": 3.987089157104492, "rewards/kidney_reward/mean": 1.5778536796569824, "rewards/kidney_reward/std": 1.9939128160476685, "rewards/length2tails_reward/mean": 0.636410117149353, "rewards/length2tails_reward/std": 0.3963748514652252, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.8482444286346436, "rewards/thermo_reward/std": 2.960555076599121, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.96875, "completions/mean_terminated_length": 269.96875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.0918376175686717, "epoch": 0.504, "frac_reward_zero_std": 0.0, "grad_norm": 0.06933335214853287, "learning_rate": 1.9669688110027664e-06, "loss": -0.0004, "num_tokens": 2183493.0, "reward": 9.228353500366211, "reward_std": 8.074507713317871, "rewards/fitness_reward/mean": 5.570281028747559, "rewards/fitness_reward/std": 4.243092060089111, "rewards/kidney_reward/mean": 1.7595927715301514, "rewards/kidney_reward/std": 1.8972914218902588, "rewards/length2tails_reward/mean": 0.5458166003227234, "rewards/length2tails_reward/std": 0.379034161567688, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.7438974380493164, "rewards/thermo_reward/std": 2.649005651473999, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.34375, "completions/mean_terminated_length": 271.34375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10837728530168533, "epoch": 0.506, "frac_reward_zero_std": 0.0, "grad_norm": 0.10028190910816193, "learning_rate": 1.966641169029256e-06, "loss": -0.0051, "num_tokens": 2192208.0, "reward": 8.974296569824219, "reward_std": 7.428702354431152, "rewards/fitness_reward/mean": 5.241325855255127, "rewards/fitness_reward/std": 4.222070693969727, "rewards/kidney_reward/mean": 1.8032042980194092, "rewards/kidney_reward/std": 1.6074918508529663, "rewards/length2tails_reward/mean": 0.643540620803833, "rewards/length2tails_reward/std": 0.3968313932418823, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 1.7716628313064575, "rewards/thermo_reward/std": 2.5006768703460693, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.5625, "completions/mean_terminated_length": 270.5625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0990586718544364, "epoch": 0.508, "frac_reward_zero_std": 0.0, "grad_norm": 0.08781884610652924, "learning_rate": 1.966311937657224e-06, "loss": -0.0021, "num_tokens": 2200898.0, "reward": 10.355709075927734, "reward_std": 5.7248406410217285, "rewards/fitness_reward/mean": 6.132905006408691, "rewards/fitness_reward/std": 3.5404434204101562, "rewards/kidney_reward/mean": 2.206814765930176, "rewards/kidney_reward/std": 1.017005443572998, "rewards/length2tails_reward/mean": 0.5586007833480835, "rewards/length2tails_reward/std": 0.4141377806663513, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.8601289987564087, "rewards/thermo_reward/std": 2.4966318607330322, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 270.21875, "completions/mean_terminated_length": 270.21875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10989784728735685, "epoch": 0.51, "frac_reward_zero_std": 0.0, "grad_norm": 0.3989095687866211, "learning_rate": 1.9659811174280078e-06, "loss": 0.002, "num_tokens": 2209577.0, "reward": 9.617646217346191, "reward_std": 7.460577964782715, "rewards/fitness_reward/mean": 5.974531650543213, "rewards/fitness_reward/std": 3.737210273742676, "rewards/kidney_reward/mean": 1.925750732421875, "rewards/kidney_reward/std": 1.7319450378417969, "rewards/length2tails_reward/mean": 0.5906080603599548, "rewards/length2tails_reward/std": 0.3697930872440338, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.558302640914917, "rewards/thermo_reward/std": 2.6359283924102783, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.96875, "completions/mean_terminated_length": 270.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10572863556444645, "epoch": 0.512, "frac_reward_zero_std": 0.0, "grad_norm": 0.06452004611492157, "learning_rate": 1.965648708885559e-06, "loss": -0.0064, "num_tokens": 2218280.0, "reward": 12.625532150268555, "reward_std": 3.140712261199951, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.4296531677246094, "rewards/kidney_reward/std": 0.5578335523605347, "rewards/length2tails_reward/mean": 0.6430323123931885, "rewards/length2tails_reward/std": 0.36871227622032166, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.978522300720215, "rewards/thermo_reward/std": 1.303676962852478, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.5625, "completions/mean_terminated_length": 271.5625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09694924391806126, "epoch": 0.514, "frac_reward_zero_std": 0.0, "grad_norm": 0.052169837057590485, "learning_rate": 1.965314712576439e-06, "loss": -0.0001, "num_tokens": 2227002.0, "reward": 11.298360824584961, "reward_std": 5.958661079406738, "rewards/fitness_reward/mean": 6.431214332580566, "rewards/fitness_reward/std": 2.8412742614746094, "rewards/kidney_reward/mean": 2.123246192932129, "rewards/kidney_reward/std": 1.3759034872055054, "rewards/length2tails_reward/mean": 0.6636830568313599, "rewards/length2tails_reward/std": 0.38713860511779785, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.5775318145751953, "rewards/thermo_reward/std": 1.9351935386657715, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09880248364061117, "epoch": 0.516, "frac_reward_zero_std": 0.0, "grad_norm": 0.06404350697994232, "learning_rate": 1.964979129049821e-06, "loss": -0.0011, "num_tokens": 2235710.0, "reward": 8.78803825378418, "reward_std": 7.820840358734131, "rewards/fitness_reward/mean": 5.565892219543457, "rewards/fitness_reward/std": 3.981142282485962, "rewards/kidney_reward/mean": 1.5922739505767822, "rewards/kidney_reward/std": 1.8315236568450928, "rewards/length2tails_reward/mean": 0.6426012516021729, "rewards/length2tails_reward/std": 0.40327534079551697, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 1.471862554550171, "rewards/thermo_reward/std": 2.617784261703491, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.21875, "completions/mean_terminated_length": 271.21875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.11636047996580601, "epoch": 0.518, "frac_reward_zero_std": 0.0, "grad_norm": 0.07931548357009888, "learning_rate": 1.964641958857489e-06, "loss": -0.0016, "num_tokens": 2244421.0, "reward": 8.441205978393555, "reward_std": 8.229249954223633, "rewards/fitness_reward/mean": 5.807088851928711, "rewards/fitness_reward/std": 3.8794262409210205, "rewards/kidney_reward/mean": 1.409515380859375, "rewards/kidney_reward/std": 2.178295850753784, "rewards/length2tails_reward/mean": 0.6650169491767883, "rewards/length2tails_reward/std": 0.35845041275024414, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.0580998659133911, "rewards/thermo_reward/std": 3.0075619220733643, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10207339003682137, "epoch": 0.52, "frac_reward_zero_std": 0.0, "grad_norm": 0.1335216462612152, "learning_rate": 1.964303202553833e-06, "loss": 0.0047, "num_tokens": 2253101.0, "reward": 7.660137176513672, "reward_std": 9.0372953414917, "rewards/fitness_reward/mean": 5.297764778137207, "rewards/fitness_reward/std": 4.39350700378418, "rewards/kidney_reward/mean": 1.3012959957122803, "rewards/kidney_reward/std": 2.284921169281006, "rewards/length2tails_reward/mean": 0.5699120759963989, "rewards/length2tails_reward/std": 0.3861338198184967, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.9040855169296265, "rewards/thermo_reward/std": 3.124284029006958, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 270.71875, "completions/mean_terminated_length": 270.71875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0989492554217577, "epoch": 0.522, "frac_reward_zero_std": 0.0, "grad_norm": 0.08350550383329391, "learning_rate": 1.9639628606958534e-06, "loss": -0.0036, "num_tokens": 2261796.0, "reward": 9.212039947509766, "reward_std": 7.971827030181885, "rewards/fitness_reward/mean": 5.565590858459473, "rewards/fitness_reward/std": 4.099094390869141, "rewards/kidney_reward/mean": 1.6808946132659912, "rewards/kidney_reward/std": 1.801498293876648, "rewards/length2tails_reward/mean": 0.6365346312522888, "rewards/length2tails_reward/std": 0.3803783357143402, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.8019015789031982, "rewards/thermo_reward/std": 2.71297025680542, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 267.46875, "completions/mean_terminated_length": 267.46875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.10223742201924324, "epoch": 0.524, "frac_reward_zero_std": 0.0, "grad_norm": 0.2912435829639435, "learning_rate": 1.9636209338431567e-06, "loss": -0.0394, "num_tokens": 2270387.0, "reward": 10.56210994720459, "reward_std": 7.012156009674072, "rewards/fitness_reward/mean": 6.2607808113098145, "rewards/fitness_reward/std": 3.311769485473633, "rewards/kidney_reward/mean": 1.8473119735717773, "rewards/kidney_reward/std": 1.8399372100830078, "rewards/length2tails_reward/mean": 0.5963507890701294, "rewards/length2tails_reward/std": 0.3624004125595093, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.294381618499756, "rewards/thermo_reward/std": 2.297987461090088, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.875, "completions/mean_terminated_length": 270.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1105208182707429, "epoch": 0.526, "frac_reward_zero_std": 0.0, "grad_norm": 0.07265937328338623, "learning_rate": 1.963277422557956e-06, "loss": 0.003, "num_tokens": 2279087.0, "reward": 10.239557266235352, "reward_std": 7.107791900634766, "rewards/fitness_reward/mean": 6.289301872253418, "rewards/fitness_reward/std": 3.386221170425415, "rewards/kidney_reward/mean": 1.9778251647949219, "rewards/kidney_reward/std": 1.6936469078063965, "rewards/length2tails_reward/mean": 0.6477591395378113, "rewards/length2tails_reward/std": 0.37626996636390686, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.8076553344726562, "rewards/thermo_reward/std": 2.515333652496338, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10514908283948898, "epoch": 0.528, "frac_reward_zero_std": 0.0, "grad_norm": 0.05402668938040733, "learning_rate": 1.962932327405069e-06, "loss": 0.0008, "num_tokens": 2287783.0, "reward": 10.216456413269043, "reward_std": 7.160484790802002, "rewards/fitness_reward/mean": 6.291948318481445, "rewards/fitness_reward/std": 3.37766695022583, "rewards/kidney_reward/mean": 1.8252959251403809, "rewards/kidney_reward/std": 1.7469253540039062, "rewards/length2tails_reward/mean": 0.6056156158447266, "rewards/length2tails_reward/std": 0.4104541540145874, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.938650131225586, "rewards/thermo_reward/std": 2.5619373321533203, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.5625, "completions/mean_terminated_length": 269.5625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09763561934232712, "epoch": 0.53, "frac_reward_zero_std": 0.0, "grad_norm": 0.06480225920677185, "learning_rate": 1.962585648951918e-06, "loss": 0.0019, "num_tokens": 2296441.0, "reward": 10.411375045776367, "reward_std": 6.475214004516602, "rewards/fitness_reward/mean": 6.272033214569092, "rewards/fitness_reward/std": 3.441664695739746, "rewards/kidney_reward/mean": 2.0811996459960938, "rewards/kidney_reward/std": 1.392430067062378, "rewards/length2tails_reward/mean": 0.5402275919914246, "rewards/length2tails_reward/std": 0.3795466125011444, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.904119849205017, "rewards/thermo_reward/std": 2.7143473625183105, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1107251700013876, "epoch": 0.532, "frac_reward_zero_std": 0.0, "grad_norm": 0.08268602937459946, "learning_rate": 1.962237387768529e-06, "loss": -0.0002, "num_tokens": 2305181.0, "reward": 10.285980224609375, "reward_std": 7.013757228851318, "rewards/fitness_reward/mean": 6.299373626708984, "rewards/fitness_reward/std": 3.354177713394165, "rewards/kidney_reward/mean": 2.0277373790740967, "rewards/kidney_reward/std": 1.6260619163513184, "rewards/length2tails_reward/mean": 0.7632877230644226, "rewards/length2tails_reward/std": 0.2882511615753174, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.7825400829315186, "rewards/thermo_reward/std": 2.7083804607391357, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.34375, "completions/mean_terminated_length": 270.34375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10624216496944427, "epoch": 0.534, "frac_reward_zero_std": 0.0, "grad_norm": 0.07529258728027344, "learning_rate": 1.9618875444275292e-06, "loss": -0.0061, "num_tokens": 2313864.0, "reward": 10.298490524291992, "reward_std": 5.719725131988525, "rewards/fitness_reward/mean": 5.982329368591309, "rewards/fitness_reward/std": 3.718473196029663, "rewards/kidney_reward/mean": 2.1536967754364014, "rewards/kidney_reward/std": 1.1052112579345703, "rewards/length2tails_reward/mean": 0.57701575756073, "rewards/length2tails_reward/std": 0.3976078927516937, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.0047621726989746, "rewards/thermo_reward/std": 2.1830263137817383, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.09375, "completions/mean_terminated_length": 271.09375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1106356205418706, "epoch": 0.536, "frac_reward_zero_std": 0.0, "grad_norm": 0.07455572485923767, "learning_rate": 1.961536119504149e-06, "loss": -0.0065, "num_tokens": 2322571.0, "reward": 11.61314868927002, "reward_std": 3.800121784210205, "rewards/fitness_reward/mean": 6.880525588989258, "rewards/fitness_reward/std": 1.8558024168014526, "rewards/kidney_reward/mean": 2.280116558074951, "rewards/kidney_reward/std": 0.8632156848907471, "rewards/length2tails_reward/mean": 0.6226137280464172, "rewards/length2tails_reward/std": 0.3921652138233185, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.290245294570923, "rewards/thermo_reward/std": 1.98012113571167, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.8125, "completions/mean_terminated_length": 271.8125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11541940923780203, "epoch": 0.538, "frac_reward_zero_std": 0.0, "grad_norm": 0.0967407152056694, "learning_rate": 1.9611831135762175e-06, "loss": -0.0032, "num_tokens": 2331301.0, "reward": 8.631363868713379, "reward_std": 8.920723915100098, "rewards/fitness_reward/mean": 5.311345100402832, "rewards/fitness_reward/std": 4.344130516052246, "rewards/kidney_reward/mean": 1.4954752922058105, "rewards/kidney_reward/std": 2.1614737510681152, "rewards/length2tails_reward/mean": 0.7488479614257812, "rewards/length2tails_reward/std": 0.29961255192756653, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.6496593952178955, "rewards/thermo_reward/std": 2.8214712142944336, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.40625, "completions/mean_terminated_length": 270.40625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1007602158933878, "epoch": 0.54, "frac_reward_zero_std": 0.0, "grad_norm": 0.06450812518596649, "learning_rate": 1.960828527224165e-06, "loss": 0.0034, "num_tokens": 2339986.0, "reward": 10.680625915527344, "reward_std": 7.085577011108398, "rewards/fitness_reward/mean": 6.308061122894287, "rewards/fitness_reward/std": 3.326788902282715, "rewards/kidney_reward/mean": 1.973847508430481, "rewards/kidney_reward/std": 1.7045009136199951, "rewards/length2tails_reward/mean": 0.6022521257400513, "rewards/length2tails_reward/std": 0.36868545413017273, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.238492488861084, "rewards/thermo_reward/std": 2.4997081756591797, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 270.4375, "completions/mean_terminated_length": 270.4375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10899223946034908, "epoch": 0.542, "frac_reward_zero_std": 0.0, "grad_norm": 0.07492761313915253, "learning_rate": 1.9604723610310193e-06, "loss": -0.0014, "num_tokens": 2348672.0, "reward": 9.707115173339844, "reward_std": 6.792953014373779, "rewards/fitness_reward/mean": 6.241734504699707, "rewards/fitness_reward/std": 3.173224449157715, "rewards/kidney_reward/mean": 1.650956630706787, "rewards/kidney_reward/std": 1.7409296035766602, "rewards/length2tails_reward/mean": 0.6261404752731323, "rewards/length2tails_reward/std": 0.36519333720207214, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.6518101692199707, "rewards/thermo_reward/std": 2.6973798274993896, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10372630972415209, "epoch": 0.544, "frac_reward_zero_std": 0.0, "grad_norm": 0.04830276221036911, "learning_rate": 1.960114615582406e-06, "loss": -0.0031, "num_tokens": 2357376.0, "reward": 10.46242904663086, "reward_std": 6.330596446990967, "rewards/fitness_reward/mean": 6.598143100738525, "rewards/fitness_reward/std": 2.7803759574890137, "rewards/kidney_reward/mean": 1.8887929916381836, "rewards/kidney_reward/std": 1.5851860046386719, "rewards/length2tails_reward/mean": 0.6319226026535034, "rewards/length2tails_reward/std": 0.39739856123924255, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.812300205230713, "rewards/thermo_reward/std": 2.750009298324585, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.4375, "completions/mean_terminated_length": 271.4375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10615959856659174, "epoch": 0.546, "frac_reward_zero_std": 0.0, "grad_norm": 0.12400078773498535, "learning_rate": 1.959755291466548e-06, "loss": -0.0009, "num_tokens": 2366094.0, "reward": 9.208582878112793, "reward_std": 7.6028733253479, "rewards/fitness_reward/mean": 6.126691818237305, "rewards/fitness_reward/std": 3.529179096221924, "rewards/kidney_reward/mean": 1.646256446838379, "rewards/kidney_reward/std": 1.9871598482131958, "rewards/length2tails_reward/mean": 0.6565751433372498, "rewards/length2tails_reward/std": 0.37000566720962524, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.2699763774871826, "rewards/thermo_reward/std": 2.9453539848327637, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 264.5625, "completions/mean_terminated_length": 264.5625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.09374482091516256, "epoch": 0.548, "frac_reward_zero_std": 0.0, "grad_norm": 0.28227949142456055, "learning_rate": 1.959394389274264e-06, "loss": -0.0441, "num_tokens": 2374592.0, "reward": 6.746344566345215, "reward_std": 10.040284156799316, "rewards/fitness_reward/mean": 4.243337631225586, "rewards/fitness_reward/std": 5.081197261810303, "rewards/kidney_reward/mean": 1.1457996368408203, "rewards/kidney_reward/std": 2.2898268699645996, "rewards/length2tails_reward/mean": 0.6021537184715271, "rewards/length2tails_reward/std": 0.4084654748439789, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 1.2032418251037598, "rewards/thermo_reward/std": 2.993588924407959, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.0625, "completions/mean_terminated_length": 270.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08829090185463428, "epoch": 0.55, "frac_reward_zero_std": 0.0, "grad_norm": 0.07252117991447449, "learning_rate": 1.9590319095989656e-06, "loss": -0.0062, "num_tokens": 2383266.0, "reward": 7.876636505126953, "reward_std": 8.928849220275879, "rewards/fitness_reward/mean": 4.758971691131592, "rewards/fitness_reward/std": 4.770308494567871, "rewards/kidney_reward/mean": 1.543241262435913, "rewards/kidney_reward/std": 1.9612581729888916, "rewards/length2tails_reward/mean": 0.5653245449066162, "rewards/length2tails_reward/std": 0.38337770104408264, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.4178907871246338, "rewards/thermo_reward/std": 2.895482063293457, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 265.03125, "completions/mean_terminated_length": 265.03125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.11645239777863026, "epoch": 0.552, "frac_reward_zero_std": 0.0, "grad_norm": 0.374436616897583, "learning_rate": 1.9586678530366606e-06, "loss": -0.0846, "num_tokens": 2391779.0, "reward": 9.554816246032715, "reward_std": 6.905723571777344, "rewards/fitness_reward/mean": 6.215795516967773, "rewards/fitness_reward/std": 3.2936534881591797, "rewards/kidney_reward/mean": 1.817582130432129, "rewards/kidney_reward/std": 1.7202730178833008, "rewards/length2tails_reward/mean": 0.6768839359283447, "rewards/length2tails_reward/std": 0.3586890399456024, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.3537501096725464, "rewards/thermo_reward/std": 2.714689254760742, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.6875, "completions/mean_terminated_length": 271.6875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11658206302672625, "epoch": 0.554, "frac_reward_zero_std": 0.0, "grad_norm": 0.15221410989761353, "learning_rate": 1.9583022201859483e-06, "loss": -0.0003, "num_tokens": 2400505.0, "reward": 9.753368377685547, "reward_std": 6.594207286834717, "rewards/fitness_reward/mean": 6.377725124359131, "rewards/fitness_reward/std": 3.114637613296509, "rewards/kidney_reward/mean": 1.8250882625579834, "rewards/kidney_reward/std": 1.6681197881698608, "rewards/length2tails_reward/mean": 0.6662698984146118, "rewards/length2tails_reward/std": 0.3910340368747711, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.383927583694458, "rewards/thermo_reward/std": 2.8232386112213135, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.875, "completions/mean_terminated_length": 269.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10668938141316175, "epoch": 0.556, "frac_reward_zero_std": 0.0, "grad_norm": 0.05785488709807396, "learning_rate": 1.9579350116480196e-06, "loss": 0.0003, "num_tokens": 2409173.0, "reward": 10.6722993850708, "reward_std": 6.572075366973877, "rewards/fitness_reward/mean": 6.304146766662598, "rewards/fitness_reward/std": 3.339707612991333, "rewards/kidney_reward/mean": 2.1426329612731934, "rewards/kidney_reward/std": 1.4806450605392456, "rewards/length2tails_reward/mean": 0.6038267612457275, "rewards/length2tails_reward/std": 0.3453625440597534, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.065136432647705, "rewards/thermo_reward/std": 2.26564884185791, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 270.9375, "completions/mean_terminated_length": 270.9375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10630720760673285, "epoch": 0.558, "frac_reward_zero_std": 0.0, "grad_norm": 0.449964702129364, "learning_rate": 1.9575662280266574e-06, "loss": 0.0062, "num_tokens": 2417875.0, "reward": 11.194704055786133, "reward_std": 5.337928771972656, "rewards/fitness_reward/mean": 6.6108551025390625, "rewards/fitness_reward/std": 2.733341932296753, "rewards/kidney_reward/mean": 2.1053543090820312, "rewards/kidney_reward/std": 1.3551630973815918, "rewards/length2tails_reward/mean": 0.6207993626594543, "rewards/length2tails_reward/std": 0.3310246765613556, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.3164145946502686, "rewards/thermo_reward/std": 2.2865805625915527, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.78125, "completions/mean_terminated_length": 270.78125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11214358266443014, "epoch": 0.56, "frac_reward_zero_std": 0.0, "grad_norm": 0.06810147315263748, "learning_rate": 1.957195869928234e-06, "loss": -0.0043, "num_tokens": 2426572.0, "reward": 10.33513355255127, "reward_std": 5.696887969970703, "rewards/fitness_reward/mean": 6.178849220275879, "rewards/fitness_reward/std": 3.1863362789154053, "rewards/kidney_reward/mean": 2.0156962871551514, "rewards/kidney_reward/std": 1.3245792388916016, "rewards/length2tails_reward/mean": 0.6156526803970337, "rewards/length2tails_reward/std": 0.37771496176719666, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9790230989456177, "rewards/thermo_reward/std": 2.088127374649048, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.40625, "completions/mean_terminated_length": 271.40625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1069456310942769, "epoch": 0.562, "frac_reward_zero_std": 0.0, "grad_norm": 0.08374135196208954, "learning_rate": 1.9568239379617085e-06, "loss": -0.0087, "num_tokens": 2435289.0, "reward": 11.6463623046875, "reward_std": 3.750762939453125, "rewards/fitness_reward/mean": 7.188657283782959, "rewards/fitness_reward/std": 0.5449937582015991, "rewards/kidney_reward/mean": 2.1691274642944336, "rewards/kidney_reward/std": 1.278011679649353, "rewards/length2tails_reward/mean": 0.6427605152130127, "rewards/length2tails_reward/std": 0.36037594079971313, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.1243011951446533, "rewards/thermo_reward/std": 2.355400800704956, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.3125, "completions/mean_terminated_length": 271.3125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1080938633531332, "epoch": 0.564, "frac_reward_zero_std": 0.0, "grad_norm": 0.05745893344283104, "learning_rate": 1.9564504327386314e-06, "loss": -0.0014, "num_tokens": 2444003.0, "reward": 10.508401870727539, "reward_std": 6.477418899536133, "rewards/fitness_reward/mean": 6.332326889038086, "rewards/fitness_reward/std": 3.255727767944336, "rewards/kidney_reward/mean": 2.0130109786987305, "rewards/kidney_reward/std": 1.4932106733322144, "rewards/length2tails_reward/mean": 0.6512283086776733, "rewards/length2tails_reward/std": 0.3932029604911804, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9979419708251953, "rewards/thermo_reward/std": 2.280850648880005, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1069063525646925, "epoch": 0.566, "frac_reward_zero_std": 0.0, "grad_norm": 0.06927718967199326, "learning_rate": 1.956075354873137e-06, "loss": -0.0039, "num_tokens": 2452683.0, "reward": 9.580037117004395, "reward_std": 6.869331359863281, "rewards/fitness_reward/mean": 6.043339729309082, "rewards/fitness_reward/std": 3.5511674880981445, "rewards/kidney_reward/mean": 1.8608475923538208, "rewards/kidney_reward/std": 1.5245518684387207, "rewards/length2tails_reward/mean": 0.5623090863227844, "rewards/length2tails_reward/std": 0.3931002616882324, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 1.5258690118789673, "rewards/thermo_reward/std": 2.6764273643493652, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.46875, "completions/mean_terminated_length": 270.46875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10221463162451982, "epoch": 0.568, "frac_reward_zero_std": 0.0, "grad_norm": 0.10410986095666885, "learning_rate": 1.9556987049819476e-06, "loss": -0.0031, "num_tokens": 2461370.0, "reward": 8.779458999633789, "reward_std": 8.178945541381836, "rewards/fitness_reward/mean": 5.395164489746094, "rewards/fitness_reward/std": 4.234164714813232, "rewards/kidney_reward/mean": 1.618592381477356, "rewards/kidney_reward/std": 1.9761297702789307, "rewards/length2tails_reward/mean": 0.6146419048309326, "rewards/length2tails_reward/std": 0.38370072841644287, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.6042392253875732, "rewards/thermo_reward/std": 2.7575721740722656, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.9375, "completions/mean_terminated_length": 269.9375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10469623561948538, "epoch": 0.57, "frac_reward_zero_std": 0.0, "grad_norm": 0.11121048033237457, "learning_rate": 1.9553204836843688e-06, "loss": -0.0079, "num_tokens": 2470040.0, "reward": 11.497368812561035, "reward_std": 4.967898368835449, "rewards/fitness_reward/mean": 6.964468955993652, "rewards/fitness_reward/std": 1.9358503818511963, "rewards/kidney_reward/mean": 2.1352062225341797, "rewards/kidney_reward/std": 1.271851897239685, "rewards/length2tails_reward/mean": 0.529200553894043, "rewards/length2tails_reward/std": 0.424444317817688, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.2447729110717773, "rewards/thermo_reward/std": 2.451089382171631, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.71875, "completions/mean_terminated_length": 270.71875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11302467249333858, "epoch": 0.572, "frac_reward_zero_std": 0.0, "grad_norm": 0.0896771252155304, "learning_rate": 1.9549406916022904e-06, "loss": -0.0066, "num_tokens": 2478735.0, "reward": 10.75018310546875, "reward_std": 6.041914939880371, "rewards/fitness_reward/mean": 6.624394416809082, "rewards/fitness_reward/std": 2.682888984680176, "rewards/kidney_reward/mean": 2.046433448791504, "rewards/kidney_reward/std": 1.4409186840057373, "rewards/length2tails_reward/mean": 0.6032804250717163, "rewards/length2tails_reward/std": 0.40032944083213806, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9190268516540527, "rewards/thermo_reward/std": 2.572425127029419, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 270.09375, "completions/mean_terminated_length": 270.09375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09249001834541559, "epoch": 0.574, "frac_reward_zero_std": 0.0, "grad_norm": 0.07904231548309326, "learning_rate": 1.954559329360185e-06, "loss": -0.0011, "num_tokens": 2487410.0, "reward": 9.763313293457031, "reward_std": 6.718929767608643, "rewards/fitness_reward/mean": 6.160808563232422, "rewards/fitness_reward/std": 3.4212520122528076, "rewards/kidney_reward/mean": 1.9048984050750732, "rewards/kidney_reward/std": 1.5606497526168823, "rewards/length2tails_reward/mean": 0.516953706741333, "rewards/length2tails_reward/std": 0.39459678530693054, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.5459113121032715, "rewards/thermo_reward/std": 2.662203788757324, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.5625, "completions/mean_terminated_length": 271.5625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10279795341193676, "epoch": 0.576, "frac_reward_zero_std": 0.0, "grad_norm": 0.07141412794589996, "learning_rate": 1.954176397585107e-06, "loss": -0.0029, "num_tokens": 2496132.0, "reward": 9.95760726928711, "reward_std": 6.590401649475098, "rewards/fitness_reward/mean": 6.230714321136475, "rewards/fitness_reward/std": 3.2086310386657715, "rewards/kidney_reward/mean": 1.7944406270980835, "rewards/kidney_reward/std": 1.552512288093567, "rewards/length2tails_reward/mean": 0.6762110590934753, "rewards/length2tails_reward/std": 0.3639642894268036, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.7648320198059082, "rewards/thermo_reward/std": 2.7508981227874756, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.15625, "completions/mean_terminated_length": 270.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09227543417364359, "epoch": 0.578, "frac_reward_zero_std": 0.0, "grad_norm": 0.08819398283958435, "learning_rate": 1.953791896906692e-06, "loss": -0.0006, "num_tokens": 2504809.0, "reward": 8.959890365600586, "reward_std": 6.8027191162109375, "rewards/fitness_reward/mean": 6.060881614685059, "rewards/fitness_reward/std": 3.4265811443328857, "rewards/kidney_reward/mean": 1.7525863647460938, "rewards/kidney_reward/std": 1.6423976421356201, "rewards/length2tails_reward/mean": 0.5825626850128174, "rewards/length2tails_reward/std": 0.36792027950286865, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.988166868686676, "rewards/thermo_reward/std": 2.7589187622070312, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.875, "completions/mean_terminated_length": 270.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1035487987101078, "epoch": 0.58, "frac_reward_zero_std": 0.0, "grad_norm": 0.060150545090436935, "learning_rate": 1.9534058279571543e-06, "loss": -0.0034, "num_tokens": 2513509.0, "reward": 11.565305709838867, "reward_std": 4.800674915313721, "rewards/fitness_reward/mean": 6.949488639831543, "rewards/fitness_reward/std": 2.019439697265625, "rewards/kidney_reward/mean": 2.3376946449279785, "rewards/kidney_reward/std": 1.153743028640747, "rewards/length2tails_reward/mean": 0.5994836688041687, "rewards/length2tails_reward/std": 0.3866693377494812, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.118173599243164, "rewards/thermo_reward/std": 2.375201463699341, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.6875, "completions/mean_terminated_length": 270.6875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10638982523232698, "epoch": 0.582, "frac_reward_zero_std": 0.0, "grad_norm": 0.09418369829654694, "learning_rate": 1.953018191371287e-06, "loss": -0.0008, "num_tokens": 2522203.0, "reward": 8.354490280151367, "reward_std": 8.561019897460938, "rewards/fitness_reward/mean": 5.269381523132324, "rewards/fitness_reward/std": 4.298111438751221, "rewards/kidney_reward/mean": 1.5763875246047974, "rewards/kidney_reward/std": 1.9843838214874268, "rewards/length2tails_reward/mean": 0.6417949199676514, "rewards/length2tails_reward/std": 0.36956968903541565, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 1.3507918119430542, "rewards/thermo_reward/std": 2.8522260189056396, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09572751075029373, "epoch": 0.584, "frac_reward_zero_std": 0.0, "grad_norm": 0.11383271962404251, "learning_rate": 1.9526289877864616e-06, "loss": -0.0036, "num_tokens": 2530883.0, "reward": 8.888689994812012, "reward_std": 7.696630477905273, "rewards/fitness_reward/mean": 5.718735694885254, "rewards/fitness_reward/std": 3.7902793884277344, "rewards/kidney_reward/mean": 1.564140796661377, "rewards/kidney_reward/std": 1.957545518875122, "rewards/length2tails_reward/mean": 0.5904607176780701, "rewards/length2tails_reward/std": 0.3688611686229706, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.446768045425415, "rewards/thermo_reward/std": 3.0040109157562256, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.21875, "completions/mean_terminated_length": 270.21875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.11232748720794916, "epoch": 0.586, "frac_reward_zero_std": 0.0, "grad_norm": 0.08226846903562546, "learning_rate": 1.9522382178426256e-06, "loss": -0.0042, "num_tokens": 2539562.0, "reward": 10.394163131713867, "reward_std": 6.431159973144531, "rewards/fitness_reward/mean": 6.652461528778076, "rewards/fitness_reward/std": 2.7887954711914062, "rewards/kidney_reward/mean": 1.8936915397644043, "rewards/kidney_reward/std": 1.7695732116699219, "rewards/length2tails_reward/mean": 0.5431311130523682, "rewards/length2tails_reward/std": 0.41602662205696106, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.6936962604522705, "rewards/thermo_reward/std": 2.7570669651031494, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10617440287023783, "epoch": 0.588, "frac_reward_zero_std": 0.0, "grad_norm": 0.1224871352314949, "learning_rate": 1.9518458821823017e-06, "loss": -0.0029, "num_tokens": 2548305.0, "reward": 12.160886764526367, "reward_std": 3.0531609058380127, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.3008062839508057, "rewards/kidney_reward/std": 0.9739616513252258, "rewards/length2tails_reward/mean": 0.7086777687072754, "rewards/length2tails_reward/std": 0.36278241872787476, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4430453777313232, "rewards/thermo_reward/std": 1.9213484525680542, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.53125, "completions/mean_terminated_length": 270.53125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09158930648118258, "epoch": 0.59, "frac_reward_zero_std": 0.0, "grad_norm": 0.06286156922578812, "learning_rate": 1.9514519814505873e-06, "loss": -0.0031, "num_tokens": 2556994.0, "reward": 10.294597625732422, "reward_std": 6.84071159362793, "rewards/fitness_reward/mean": 6.325943470001221, "rewards/fitness_reward/std": 3.271643877029419, "rewards/kidney_reward/mean": 1.911651611328125, "rewards/kidney_reward/std": 1.6634387969970703, "rewards/length2tails_reward/mean": 0.562515139579773, "rewards/length2tails_reward/std": 0.42633259296417236, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9007506370544434, "rewards/thermo_reward/std": 2.5632848739624023, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 268.90625, "completions/mean_terminated_length": 268.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09582995343953371, "epoch": 0.592, "frac_reward_zero_std": 0.0, "grad_norm": 0.06275047361850739, "learning_rate": 1.9510565162951534e-06, "loss": -0.0034, "num_tokens": 2565631.0, "reward": 10.032028198242188, "reward_std": 6.264020919799805, "rewards/fitness_reward/mean": 6.083130836486816, "rewards/fitness_reward/std": 3.4429848194122314, "rewards/kidney_reward/mean": 1.9887399673461914, "rewards/kidney_reward/std": 1.3645800352096558, "rewards/length2tails_reward/mean": 0.47472113370895386, "rewards/length2tails_reward/std": 0.4072186350822449, "rewards/repeated_in_batch_reward/mean": 0.90625, "rewards/repeated_in_batch_reward/std": 0.2961445748806, "rewards/thermo_reward/mean": 1.822059988975525, "rewards/thermo_reward/std": 2.248640775680542, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11088305059820414, "epoch": 0.594, "frac_reward_zero_std": 0.0, "grad_norm": 0.08249466121196747, "learning_rate": 1.9506594873662434e-06, "loss": -0.0015, "num_tokens": 2574368.0, "reward": 11.153301239013672, "reward_std": 6.116630554199219, "rewards/fitness_reward/mean": 6.5505523681640625, "rewards/fitness_reward/std": 2.78484845161438, "rewards/kidney_reward/mean": 2.10675311088562, "rewards/kidney_reward/std": 1.4828115701675415, "rewards/length2tails_reward/mean": 0.6954585313796997, "rewards/length2tails_reward/std": 0.37617215514183044, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.3264503479003906, "rewards/thermo_reward/std": 2.2143211364746094, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.59375, "completions/mean_terminated_length": 270.59375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1028360053896904, "epoch": 0.596, "frac_reward_zero_std": 0.0, "grad_norm": 0.05633142590522766, "learning_rate": 1.950260895316671e-06, "loss": -0.0029, "num_tokens": 2583059.0, "reward": 10.895092010498047, "reward_std": 5.3882575035095215, "rewards/fitness_reward/mean": 6.57216215133667, "rewards/fitness_reward/std": 2.8825032711029053, "rewards/kidney_reward/mean": 2.1237099170684814, "rewards/kidney_reward/std": 1.1620488166809082, "rewards/length2tails_reward/mean": 0.6051574945449829, "rewards/length2tails_reward/std": 0.3702513575553894, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.0387041568756104, "rewards/thermo_reward/std": 2.117142677307129, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.40625, "completions/mean_terminated_length": 271.40625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10438697133213282, "epoch": 0.598, "frac_reward_zero_std": 0.0, "grad_norm": 0.26576927304267883, "learning_rate": 1.94986074080182e-06, "loss": -0.0065, "num_tokens": 2591776.0, "reward": 10.72160530090332, "reward_std": 6.307686805725098, "rewards/fitness_reward/mean": 6.244356155395508, "rewards/fitness_reward/std": 3.3582611083984375, "rewards/kidney_reward/mean": 1.9709126949310303, "rewards/kidney_reward/std": 1.461074709892273, "rewards/length2tails_reward/mean": 0.6270080804824829, "rewards/length2tails_reward/std": 0.3762587010860443, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.343636989593506, "rewards/thermo_reward/std": 2.063184976577759, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.0625, "completions/mean_terminated_length": 271.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1024255147203803, "epoch": 0.6, "frac_reward_zero_std": 0.0, "grad_norm": 0.0735434740781784, "learning_rate": 1.9494590244796457e-06, "loss": -0.001, "num_tokens": 2600482.0, "reward": 10.59252643585205, "reward_std": 6.816814422607422, "rewards/fitness_reward/mean": 6.158326625823975, "rewards/fitness_reward/std": 3.2900969982147217, "rewards/kidney_reward/mean": 1.8844225406646729, "rewards/kidney_reward/std": 1.6350239515304565, "rewards/length2tails_reward/mean": 0.6254887580871582, "rewards/length2tails_reward/std": 0.36850452423095703, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.3872289657592773, "rewards/thermo_reward/std": 2.259819746017456, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 269.3125, "completions/mean_terminated_length": 269.3125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09671336691826582, "epoch": 0.602, "frac_reward_zero_std": 0.0, "grad_norm": 0.06916102021932602, "learning_rate": 1.9490557470106686e-06, "loss": -0.0066, "num_tokens": 2609132.0, "reward": 11.148724555969238, "reward_std": 3.967028856277466, "rewards/fitness_reward/mean": 6.99554443359375, "rewards/fitness_reward/std": 1.7628074884414673, "rewards/kidney_reward/mean": 2.1857385635375977, "rewards/kidney_reward/std": 0.9768639206886292, "rewards/length2tails_reward/mean": 0.5360361933708191, "rewards/length2tails_reward/std": 0.3558644950389862, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.8138371706008911, "rewards/thermo_reward/std": 2.337616443634033, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.96875, "completions/mean_terminated_length": 269.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11073332466185093, "epoch": 0.604, "frac_reward_zero_std": 0.0, "grad_norm": 0.1010536327958107, "learning_rate": 1.9486509090579775e-06, "loss": 0.0004, "num_tokens": 2617803.0, "reward": 10.049813270568848, "reward_std": 7.127676010131836, "rewards/fitness_reward/mean": 6.250378608703613, "rewards/fitness_reward/std": 3.3384571075439453, "rewards/kidney_reward/mean": 1.8567582368850708, "rewards/kidney_reward/std": 1.7774019241333008, "rewards/length2tails_reward/mean": 0.5704126358032227, "rewards/length2tails_reward/std": 0.3750966191291809, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.7856345176696777, "rewards/thermo_reward/std": 2.6759543418884277, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 270.6875, "completions/mean_terminated_length": 270.6875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10591023601591587, "epoch": 0.606, "frac_reward_zero_std": 0.0, "grad_norm": 0.09401101619005203, "learning_rate": 1.948244511287226e-06, "loss": -0.0038, "num_tokens": 2626497.0, "reward": 12.200855255126953, "reward_std": 2.750121831893921, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3575549125671387, "rewards/kidney_reward/std": 0.6605173945426941, "rewards/length2tails_reward/mean": 0.6274683475494385, "rewards/length2tails_reward/std": 0.32595279812812805, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.3768768310546875, "rewards/thermo_reward/std": 2.088259220123291, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10753176920115948, "epoch": 0.608, "frac_reward_zero_std": 0.0, "grad_norm": 0.05684461444616318, "learning_rate": 1.9478365543666344e-06, "loss": 0.0025, "num_tokens": 2635213.0, "reward": 11.500574111938477, "reward_std": 6.049794673919678, "rewards/fitness_reward/mean": 6.65021276473999, "rewards/fitness_reward/std": 2.7978789806365967, "rewards/kidney_reward/mean": 2.128681182861328, "rewards/kidney_reward/std": 1.4498189687728882, "rewards/length2tails_reward/mean": 0.7100331783294678, "rewards/length2tails_reward/std": 0.2788810729980469, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.5506768226623535, "rewards/thermo_reward/std": 2.18919038772583, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.6875, "completions/mean_terminated_length": 270.6875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10265343729406595, "epoch": 0.61, "frac_reward_zero_std": 0.0, "grad_norm": 0.07681909203529358, "learning_rate": 1.947427038966984e-06, "loss": 0.0004, "num_tokens": 2643907.0, "reward": 11.134979248046875, "reward_std": 6.3709282875061035, "rewards/fitness_reward/mean": 6.6581268310546875, "rewards/fitness_reward/std": 2.766512632369995, "rewards/kidney_reward/mean": 2.0494179725646973, "rewards/kidney_reward/std": 1.6289461851119995, "rewards/length2tails_reward/mean": 0.6452488899230957, "rewards/length2tails_reward/std": 0.35534244775772095, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.262909412384033, "rewards/thermo_reward/std": 2.480961799621582, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.0625, "completions/mean_terminated_length": 271.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10664802324026823, "epoch": 0.612, "frac_reward_zero_std": 0.0, "grad_norm": 0.06387440115213394, "learning_rate": 1.947015965761621e-06, "loss": 0.0015, "num_tokens": 2652613.0, "reward": 9.361135482788086, "reward_std": 7.873195648193359, "rewards/fitness_reward/mean": 5.945728302001953, "rewards/fitness_reward/std": 3.80493426322937, "rewards/kidney_reward/mean": 1.7331231832504272, "rewards/kidney_reward/std": 1.8674038648605347, "rewards/length2tails_reward/mean": 0.6784215569496155, "rewards/length2tails_reward/std": 0.3497619032859802, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.5144407749176025, "rewards/thermo_reward/std": 2.8010506629943848, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.59375, "completions/mean_terminated_length": 270.59375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1120320213958621, "epoch": 0.614, "frac_reward_zero_std": 0.0, "grad_norm": 0.15682992339134216, "learning_rate": 1.946603335426452e-06, "loss": 0.0017, "num_tokens": 2661304.0, "reward": 10.289670944213867, "reward_std": 5.526130676269531, "rewards/fitness_reward/mean": 6.585597038269043, "rewards/fitness_reward/std": 2.614732027053833, "rewards/kidney_reward/mean": 2.012620449066162, "rewards/kidney_reward/std": 1.3148826360702515, "rewards/length2tails_reward/mean": 0.6058496832847595, "rewards/length2tails_reward/std": 0.3908247947692871, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.5308687686920166, "rewards/thermo_reward/std": 2.6803879737854004, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11361629422754049, "epoch": 0.616, "frac_reward_zero_std": 0.0, "grad_norm": 0.11039303243160248, "learning_rate": 1.946189148639943e-06, "loss": -0.0011, "num_tokens": 2670016.0, "reward": 10.747947692871094, "reward_std": 6.47308874130249, "rewards/fitness_reward/mean": 6.574423789978027, "rewards/fitness_reward/std": 2.8734307289123535, "rewards/kidney_reward/mean": 2.028642177581787, "rewards/kidney_reward/std": 1.6022999286651611, "rewards/length2tails_reward/mean": 0.6725289821624756, "rewards/length2tails_reward/std": 0.3522748649120331, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.977628469467163, "rewards/thermo_reward/std": 2.5821573734283447, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.5625, "completions/mean_terminated_length": 271.5625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10842048842459917, "epoch": 0.618, "frac_reward_zero_std": 0.0, "grad_norm": 0.07693401724100113, "learning_rate": 1.94577340608312e-06, "loss": 0.001, "num_tokens": 2678738.0, "reward": 9.216243743896484, "reward_std": 8.630293846130371, "rewards/fitness_reward/mean": 5.8715739250183105, "rewards/fitness_reward/std": 4.0133185386657715, "rewards/kidney_reward/mean": 1.5296517610549927, "rewards/kidney_reward/std": 2.1336543560028076, "rewards/length2tails_reward/mean": 0.7188667058944702, "rewards/length2tails_reward/std": 0.3229285180568695, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.6431313753128052, "rewards/thermo_reward/std": 2.9635863304138184, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.21875, "completions/mean_terminated_length": 269.21875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10129899997264147, "epoch": 0.62, "frac_reward_zero_std": 0.0, "grad_norm": 0.09164420515298843, "learning_rate": 1.9453561084395687e-06, "loss": -0.0067, "num_tokens": 2687385.0, "reward": 10.077606201171875, "reward_std": 4.365948677062988, "rewards/fitness_reward/mean": 6.684309482574463, "rewards/fitness_reward/std": 2.4438283443450928, "rewards/kidney_reward/mean": 2.011162281036377, "rewards/kidney_reward/std": 1.0672303438186646, "rewards/length2tails_reward/mean": 0.4423019289970398, "rewards/length2tails_reward/std": 0.39656853675842285, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.2379043102264404, "rewards/thermo_reward/std": 2.4710021018981934, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10645945090800524, "epoch": 0.622, "frac_reward_zero_std": 0.0, "grad_norm": 0.12628157436847687, "learning_rate": 1.944937256395429e-06, "loss": -0.006, "num_tokens": 2696097.0, "reward": 7.2259135246276855, "reward_std": 7.998942852020264, "rewards/fitness_reward/mean": 5.124382019042969, "rewards/fitness_reward/std": 4.213342666625977, "rewards/kidney_reward/mean": 1.2453505992889404, "rewards/kidney_reward/std": 2.059305429458618, "rewards/length2tails_reward/mean": 0.6736060380935669, "rewards/length2tails_reward/std": 0.37152162194252014, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.6888208389282227, "rewards/thermo_reward/std": 2.8730380535125732, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.71875, "completions/mean_terminated_length": 272.71875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10902046225965023, "epoch": 0.624, "frac_reward_zero_std": 0.0, "grad_norm": 0.08679524809122086, "learning_rate": 1.9445168506393986e-06, "loss": 0.0039, "num_tokens": 2704856.0, "reward": 11.37441349029541, "reward_std": 5.79127311706543, "rewards/fitness_reward/mean": 6.695094585418701, "rewards/fitness_reward/std": 2.622786521911621, "rewards/kidney_reward/mean": 2.197542667388916, "rewards/kidney_reward/std": 1.2092877626419067, "rewards/length2tails_reward/mean": 0.7713974118232727, "rewards/length2tails_reward/std": 0.2953493297100067, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.3046369552612305, "rewards/thermo_reward/std": 2.519876003265381, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 269.59375, "completions/mean_terminated_length": 269.59375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09812906850129366, "epoch": 0.626, "frac_reward_zero_std": 0.0, "grad_norm": 0.07246968150138855, "learning_rate": 1.944094891862728e-06, "loss": -0.0018, "num_tokens": 2713515.0, "reward": 10.279682159423828, "reward_std": 6.380427360534668, "rewards/fitness_reward/mean": 6.306829452514648, "rewards/fitness_reward/std": 3.1515753269195557, "rewards/kidney_reward/mean": 1.8959795236587524, "rewards/kidney_reward/std": 1.5268750190734863, "rewards/length2tails_reward/mean": 0.5150830745697021, "rewards/length2tails_reward/std": 0.39167141914367676, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9253649711608887, "rewards/thermo_reward/std": 2.3617825508117676, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.96875, "completions/mean_terminated_length": 269.96875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09926465712487698, "epoch": 0.628, "frac_reward_zero_std": 0.0, "grad_norm": 0.06662730872631073, "learning_rate": 1.9436713807592232e-06, "loss": 0.0017, "num_tokens": 2722186.0, "reward": 10.776936531066895, "reward_std": 6.38076639175415, "rewards/fitness_reward/mean": 6.596822738647461, "rewards/fitness_reward/std": 2.7853715419769287, "rewards/kidney_reward/mean": 1.8975119590759277, "rewards/kidney_reward/std": 1.7479808330535889, "rewards/length2tails_reward/mean": 0.6078963279724121, "rewards/length2tails_reward/std": 0.3126949965953827, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.121812582015991, "rewards/thermo_reward/std": 2.486586332321167, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 271.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10481654526665807, "epoch": 0.63, "frac_reward_zero_std": 0.0, "grad_norm": 0.08617524802684784, "learning_rate": 1.943246318025242e-06, "loss": -0.0013, "num_tokens": 2730909.0, "reward": 11.040285110473633, "reward_std": 5.796336650848389, "rewards/fitness_reward/mean": 6.635307312011719, "rewards/fitness_reward/std": 2.6435258388519287, "rewards/kidney_reward/mean": 1.8447988033294678, "rewards/kidney_reward/std": 1.6061389446258545, "rewards/length2tails_reward/mean": 0.6512874364852905, "rewards/length2tails_reward/std": 0.38937750458717346, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.395050525665283, "rewards/thermo_reward/std": 2.299553155899048, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11112393904477358, "epoch": 0.632, "frac_reward_zero_std": 0.0, "grad_norm": 0.10392550379037857, "learning_rate": 1.9428197043596927e-06, "loss": -0.0003, "num_tokens": 2739629.0, "reward": 10.80946159362793, "reward_std": 6.718322277069092, "rewards/fitness_reward/mean": 6.284722805023193, "rewards/fitness_reward/std": 3.402113437652588, "rewards/kidney_reward/mean": 2.026055335998535, "rewards/kidney_reward/std": 1.583153247833252, "rewards/length2tails_reward/mean": 0.6818006038665771, "rewards/length2tails_reward/std": 0.33342015743255615, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.330503463745117, "rewards/thermo_reward/std": 2.639218807220459, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.46875, "completions/mean_terminated_length": 271.46875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10043955408036709, "epoch": 0.634, "frac_reward_zero_std": 0.0, "grad_norm": 0.07863548398017883, "learning_rate": 1.9423915404640348e-06, "loss": -0.008, "num_tokens": 2748348.0, "reward": 11.660589218139648, "reward_std": 4.825669765472412, "rewards/fitness_reward/mean": 6.682684898376465, "rewards/fitness_reward/std": 2.681821346282959, "rewards/kidney_reward/mean": 2.245955467224121, "rewards/kidney_reward/std": 1.1798933744430542, "rewards/length2tails_reward/mean": 0.635576605796814, "rewards/length2tails_reward/std": 0.39163753390312195, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.5683908462524414, "rewards/thermo_reward/std": 2.1094815731048584, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11370659247040749, "epoch": 0.636, "frac_reward_zero_std": 0.0, "grad_norm": 0.08406565338373184, "learning_rate": 1.9419618270422753e-06, "loss": -0.0037, "num_tokens": 2757069.0, "reward": 9.857394218444824, "reward_std": 6.708688735961914, "rewards/fitness_reward/mean": 6.285477638244629, "rewards/fitness_reward/std": 3.2216105461120605, "rewards/kidney_reward/mean": 1.8642642498016357, "rewards/kidney_reward/std": 1.6696314811706543, "rewards/length2tails_reward/mean": 0.7178899645805359, "rewards/length2tails_reward/std": 0.3267357647418976, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.535862922668457, "rewards/thermo_reward/std": 2.7389419078826904, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10528519842773676, "epoch": 0.638, "frac_reward_zero_std": 0.0, "grad_norm": 0.14078086614608765, "learning_rate": 1.9415305648009716e-06, "loss": -0.003, "num_tokens": 2765798.0, "reward": 11.718783378601074, "reward_std": 4.424414157867432, "rewards/fitness_reward/mean": 6.930644512176514, "rewards/fitness_reward/std": 2.1247165203094482, "rewards/kidney_reward/mean": 2.121047019958496, "rewards/kidney_reward/std": 1.3946641683578491, "rewards/length2tails_reward/mean": 0.6864926815032959, "rewards/length2tails_reward/std": 0.35047489404678345, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4984426498413086, "rewards/thermo_reward/std": 2.34663724899292, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.12153899669647217, "epoch": 0.64, "frac_reward_zero_std": 0.0, "grad_norm": 0.8654022812843323, "learning_rate": 1.9410977544492244e-06, "loss": -0.003, "num_tokens": 2774478.0, "reward": 9.140295028686523, "reward_std": 6.056877136230469, "rewards/fitness_reward/mean": 6.590029716491699, "rewards/fitness_reward/std": 2.633514881134033, "rewards/kidney_reward/mean": 1.5470025539398193, "rewards/kidney_reward/std": 1.5915534496307373, "rewards/length2tails_reward/mean": 0.5990718603134155, "rewards/length2tails_reward/std": 0.32859253883361816, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 0.8433558344841003, "rewards/thermo_reward/std": 2.8439509868621826, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.375, "completions/mean_terminated_length": 269.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10163491126149893, "epoch": 0.642, "frac_reward_zero_std": 0.0, "grad_norm": 0.1465456634759903, "learning_rate": 1.9406633966986824e-06, "loss": 0.0035, "num_tokens": 2783130.0, "reward": 10.073744773864746, "reward_std": 6.786154270172119, "rewards/fitness_reward/mean": 6.615808486938477, "rewards/fitness_reward/std": 2.9360740184783936, "rewards/kidney_reward/mean": 1.804457426071167, "rewards/kidney_reward/std": 1.8130881786346436, "rewards/length2tails_reward/mean": 0.5035993456840515, "rewards/length2tails_reward/std": 0.4101681113243103, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.5031180381774902, "rewards/thermo_reward/std": 2.811406135559082, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.46875, "completions/mean_terminated_length": 270.46875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11280594673007727, "epoch": 0.644, "frac_reward_zero_std": 0.0, "grad_norm": 0.06963612139225006, "learning_rate": 1.9402274922635376e-06, "loss": -0.0029, "num_tokens": 2791817.0, "reward": 10.2340669631958, "reward_std": 6.397792816162109, "rewards/fitness_reward/mean": 6.016045570373535, "rewards/fitness_reward/std": 3.631082773208618, "rewards/kidney_reward/mean": 2.075089931488037, "rewards/kidney_reward/std": 1.2228959798812866, "rewards/length2tails_reward/mean": 0.5873444080352783, "rewards/length2tails_reward/std": 0.3563327491283417, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 1.99044668674469, "rewards/thermo_reward/std": 2.3727447986602783, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09691578336060047, "epoch": 0.646, "frac_reward_zero_std": 0.0, "grad_norm": 0.2780953049659729, "learning_rate": 1.9397900418605256e-06, "loss": -0.0086, "num_tokens": 2800513.0, "reward": 9.048253059387207, "reward_std": 7.027248382568359, "rewards/fitness_reward/mean": 6.214809894561768, "rewards/fitness_reward/std": 3.2641353607177734, "rewards/kidney_reward/mean": 1.5307283401489258, "rewards/kidney_reward/std": 1.8468669652938843, "rewards/length2tails_reward/mean": 0.5750716328620911, "rewards/length2tails_reward/std": 0.3983921408653259, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.1452076435089111, "rewards/thermo_reward/std": 2.8743062019348145, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.1875, "completions/mean_terminated_length": 272.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11906621418893337, "epoch": 0.648, "frac_reward_zero_std": 0.0, "grad_norm": 0.06410878151655197, "learning_rate": 1.9393510462089237e-06, "loss": -0.002, "num_tokens": 2809255.0, "reward": 11.604827880859375, "reward_std": 5.0774712562561035, "rewards/fitness_reward/mean": 6.653090953826904, "rewards/fitness_reward/std": 2.569222927093506, "rewards/kidney_reward/mean": 2.1688947677612305, "rewards/kidney_reward/std": 1.2170603275299072, "rewards/length2tails_reward/mean": 0.7435036301612854, "rewards/length2tails_reward/std": 0.298550546169281, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.608492374420166, "rewards/thermo_reward/std": 1.959001898765564, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.90625, "completions/mean_terminated_length": 270.90625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10088578891009092, "epoch": 0.65, "frac_reward_zero_std": 0.0, "grad_norm": 0.15146462619304657, "learning_rate": 1.938910506030549e-06, "loss": -0.0078, "num_tokens": 2817956.0, "reward": 11.327554702758789, "reward_std": 4.425415992736816, "rewards/fitness_reward/mean": 6.687413215637207, "rewards/fitness_reward/std": 2.4316930770874023, "rewards/kidney_reward/mean": 2.190837860107422, "rewards/kidney_reward/std": 0.8751612305641174, "rewards/length2tails_reward/mean": 0.5867102146148682, "rewards/length2tails_reward/std": 0.39029496908187866, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 2.2968828678131104, "rewards/thermo_reward/std": 1.9458266496658325, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.71875, "completions/mean_terminated_length": 271.71875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.11654944997280836, "epoch": 0.652, "frac_reward_zero_std": 0.0, "grad_norm": 0.09489252418279648, "learning_rate": 1.9384684220497604e-06, "loss": -0.0039, "num_tokens": 2826683.0, "reward": 10.793950080871582, "reward_std": 6.374341011047363, "rewards/fitness_reward/mean": 6.582357406616211, "rewards/fitness_reward/std": 2.842876434326172, "rewards/kidney_reward/mean": 1.9244379997253418, "rewards/kidney_reward/std": 1.603426218032837, "rewards/length2tails_reward/mean": 0.7159044742584229, "rewards/length2tails_reward/std": 0.3271859288215637, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.1155638694763184, "rewards/thermo_reward/std": 2.4932732582092285, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 270.65625, "completions/mean_terminated_length": 270.65625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10629504360258579, "epoch": 0.654, "frac_reward_zero_std": 0.0, "grad_norm": 0.38704410195350647, "learning_rate": 1.938024794993453e-06, "loss": 0.0021, "num_tokens": 2835376.0, "reward": 9.949481964111328, "reward_std": 7.517545223236084, "rewards/fitness_reward/mean": 6.169020652770996, "rewards/fitness_reward/std": 3.3954830169677734, "rewards/kidney_reward/mean": 1.7824015617370605, "rewards/kidney_reward/std": 2.01423978805542, "rewards/length2tails_reward/mean": 0.6802841424942017, "rewards/length2tails_reward/std": 0.3347759246826172, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.8300318717956543, "rewards/thermo_reward/std": 2.6380269527435303, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.46875, "completions/mean_terminated_length": 271.46875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.11221757438033819, "epoch": 0.656, "frac_reward_zero_std": 0.0, "grad_norm": 0.1447339504957199, "learning_rate": 1.9375796255910604e-06, "loss": 0.0012, "num_tokens": 2844095.0, "reward": 12.190248489379883, "reward_std": 3.808412790298462, "rewards/fitness_reward/mean": 7.046268939971924, "rewards/fitness_reward/std": 1.7814339399337769, "rewards/kidney_reward/mean": 2.4317309856414795, "rewards/kidney_reward/std": 0.811180591583252, "rewards/length2tails_reward/mean": 0.7027704119682312, "rewards/length2tails_reward/std": 0.29092442989349365, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.5419716835021973, "rewards/thermo_reward/std": 1.7265745401382446, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11657298356294632, "epoch": 0.658, "frac_reward_zero_std": 0.0, "grad_norm": 1.7877811193466187, "learning_rate": 1.937132914574552e-06, "loss": -0.001, "num_tokens": 2852811.0, "reward": 9.436601638793945, "reward_std": 7.611492156982422, "rewards/fitness_reward/mean": 6.449830532073975, "rewards/fitness_reward/std": 3.1732778549194336, "rewards/kidney_reward/mean": 1.7130616903305054, "rewards/kidney_reward/std": 2.137336492538452, "rewards/length2tails_reward/mean": 0.6293182373046875, "rewards/length2tails_reward/std": 0.3911236524581909, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.1107772588729858, "rewards/thermo_reward/std": 3.128592014312744, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11583487037569284, "epoch": 0.66, "frac_reward_zero_std": 0.0, "grad_norm": 0.13773338496685028, "learning_rate": 1.936684662678432e-06, "loss": -0.0028, "num_tokens": 2861532.0, "reward": 10.950634956359863, "reward_std": 5.712095260620117, "rewards/fitness_reward/mean": 6.851317405700684, "rewards/fitness_reward/std": 2.2583343982696533, "rewards/kidney_reward/mean": 1.919154167175293, "rewards/kidney_reward/std": 1.6229445934295654, "rewards/length2tails_reward/mean": 0.6816023588180542, "rewards/length2tails_reward/std": 0.33754870295524597, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.012002944946289, "rewards/thermo_reward/std": 2.5088531970977783, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.6875, "completions/mean_terminated_length": 270.6875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10768833290785551, "epoch": 0.662, "frac_reward_zero_std": 0.0, "grad_norm": 0.2022908627986908, "learning_rate": 1.936234870639737e-06, "loss": 0.0017, "num_tokens": 2870226.0, "reward": 8.697061538696289, "reward_std": 8.931398391723633, "rewards/fitness_reward/mean": 5.170741081237793, "rewards/fitness_reward/std": 4.65004301071167, "rewards/kidney_reward/mean": 1.6327917575836182, "rewards/kidney_reward/std": 2.033492088317871, "rewards/length2tails_reward/mean": 0.6267710328102112, "rewards/length2tails_reward/std": 0.4012013375759125, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.7308508157730103, "rewards/thermo_reward/std": 2.9484026432037354, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.8125, "completions/mean_terminated_length": 269.8125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10277679562568665, "epoch": 0.664, "frac_reward_zero_std": 0.0, "grad_norm": 0.09230952709913254, "learning_rate": 1.935783539198038e-06, "loss": -0.0007, "num_tokens": 2878892.0, "reward": 9.331253051757812, "reward_std": 6.823907852172852, "rewards/fitness_reward/mean": 6.272053241729736, "rewards/fitness_reward/std": 3.266080617904663, "rewards/kidney_reward/mean": 1.7818467617034912, "rewards/kidney_reward/std": 1.7117173671722412, "rewards/length2tails_reward/mean": 0.5891803503036499, "rewards/length2tails_reward/std": 0.3721315562725067, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.118435263633728, "rewards/thermo_reward/std": 2.897606134414673, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.90625, "completions/mean_terminated_length": 271.90625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11460565775632858, "epoch": 0.666, "frac_reward_zero_std": 0.0, "grad_norm": 0.09108323603868484, "learning_rate": 1.9353306690954357e-06, "loss": -0.0014, "num_tokens": 2887625.0, "reward": 9.689314842224121, "reward_std": 7.377963542938232, "rewards/fitness_reward/mean": 6.1329803466796875, "rewards/fitness_reward/std": 3.3433401584625244, "rewards/kidney_reward/mean": 1.7278858423233032, "rewards/kidney_reward/std": 1.962307095527649, "rewards/length2tails_reward/mean": 0.7054315805435181, "rewards/length2tails_reward/std": 0.32727426290512085, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.6579058170318604, "rewards/thermo_reward/std": 2.702836751937866, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.96875, "completions/mean_terminated_length": 270.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1053895577788353, "epoch": 0.668, "frac_reward_zero_std": 0.0, "grad_norm": 0.08655820786952972, "learning_rate": 1.9348762610765613e-06, "loss": -0.0008, "num_tokens": 2896328.0, "reward": 10.018129348754883, "reward_std": 6.529927730560303, "rewards/fitness_reward/mean": 6.52497673034668, "rewards/fitness_reward/std": 2.844560384750366, "rewards/kidney_reward/mean": 1.8537702560424805, "rewards/kidney_reward/std": 1.7908141613006592, "rewards/length2tails_reward/mean": 0.6363632678985596, "rewards/length2tails_reward/std": 0.3371298611164093, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.4757461547851562, "rewards/thermo_reward/std": 2.5623059272766113, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.875, "completions/mean_terminated_length": 270.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11338281631469727, "epoch": 0.67, "frac_reward_zero_std": 0.0, "grad_norm": 0.0765661969780922, "learning_rate": 1.934420315888575e-06, "loss": 0.0009, "num_tokens": 2905028.0, "reward": 11.437816619873047, "reward_std": 5.1707940101623535, "rewards/fitness_reward/mean": 6.533980369567871, "rewards/fitness_reward/std": 2.809199571609497, "rewards/kidney_reward/mean": 2.2249608039855957, "rewards/kidney_reward/std": 1.1669909954071045, "rewards/length2tails_reward/mean": 0.5938228964805603, "rewards/length2tails_reward/std": 0.3681739270687103, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.5194928646087646, "rewards/thermo_reward/std": 2.083427667617798, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 269.4375, "completions/mean_terminated_length": 269.4375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1042593028396368, "epoch": 0.672, "frac_reward_zero_std": 0.0, "grad_norm": 0.13140137493610382, "learning_rate": 1.933962834281163e-06, "loss": 0.0004, "num_tokens": 2913682.0, "reward": 10.98188304901123, "reward_std": 6.552918910980225, "rewards/fitness_reward/mean": 6.586332321166992, "rewards/fitness_reward/std": 2.8265011310577393, "rewards/kidney_reward/mean": 1.8140132427215576, "rewards/kidney_reward/std": 1.8791821002960205, "rewards/length2tails_reward/mean": 0.5543262958526611, "rewards/length2tails_reward/std": 0.3463633358478546, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4261040687561035, "rewards/thermo_reward/std": 2.4456305503845215, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.46875, "completions/mean_terminated_length": 270.46875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10754356998950243, "epoch": 0.674, "frac_reward_zero_std": 0.0, "grad_norm": 0.06656590104103088, "learning_rate": 1.93350381700654e-06, "loss": -0.0006, "num_tokens": 2922369.0, "reward": 11.799091339111328, "reward_std": 5.279061317443848, "rewards/fitness_reward/mean": 6.693284034729004, "rewards/fitness_reward/std": 2.6365268230438232, "rewards/kidney_reward/mean": 2.275409698486328, "rewards/kidney_reward/std": 1.1351674795150757, "rewards/length2tails_reward/mean": 0.633697509765625, "rewards/length2tails_reward/std": 0.3460710048675537, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6670281887054443, "rewards/thermo_reward/std": 1.997921347618103, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.03125, "completions/mean_terminated_length": 270.03125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.11916785500943661, "epoch": 0.676, "frac_reward_zero_std": 0.0, "grad_norm": 0.06394181400537491, "learning_rate": 1.933043264819444e-06, "loss": -0.0055, "num_tokens": 2931042.0, "reward": 11.553443908691406, "reward_std": 3.8259716033935547, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.2776575088500977, "rewards/kidney_reward/std": 0.7665494084358215, "rewards/length2tails_reward/mean": 0.6201182007789612, "rewards/length2tails_reward/std": 0.35222068428993225, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.060720920562744, "rewards/thermo_reward/std": 2.3281824588775635, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.4375, "completions/mean_terminated_length": 271.4375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10639863647520542, "epoch": 0.678, "frac_reward_zero_std": 0.0, "grad_norm": 0.10773520171642303, "learning_rate": 1.932581178477138e-06, "loss": -0.0045, "num_tokens": 2939760.0, "reward": 12.213360786437988, "reward_std": 2.925508975982666, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.360633373260498, "rewards/kidney_reward/std": 0.6457472443580627, "rewards/length2tails_reward/mean": 0.700934648513794, "rewards/length2tails_reward/std": 0.3516809046268463, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.3789587020874023, "rewards/thermo_reward/std": 2.3317079544067383, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10935661476105452, "epoch": 0.68, "frac_reward_zero_std": 0.0, "grad_norm": 0.06685982644557953, "learning_rate": 1.9321175587394056e-06, "loss": -0.0029, "num_tokens": 2948488.0, "reward": 11.586135864257812, "reward_std": 5.158356666564941, "rewards/fitness_reward/mean": 6.88628625869751, "rewards/fitness_reward/std": 2.1171510219573975, "rewards/kidney_reward/mean": 2.1452465057373047, "rewards/kidney_reward/std": 1.2479134798049927, "rewards/length2tails_reward/mean": 0.7242114543914795, "rewards/length2tails_reward/std": 0.318882554769516, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.3821818828582764, "rewards/thermo_reward/std": 2.5958993434906006, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.84375, "completions/mean_terminated_length": 270.84375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10775735974311829, "epoch": 0.682, "frac_reward_zero_std": 0.0, "grad_norm": 0.05980192869901657, "learning_rate": 1.9316524063685538e-06, "loss": 0.0041, "num_tokens": 2957187.0, "reward": 10.59376049041748, "reward_std": 7.244329452514648, "rewards/fitness_reward/mean": 6.2857866287231445, "rewards/fitness_reward/std": 3.397279977798462, "rewards/kidney_reward/mean": 2.0456247329711914, "rewards/kidney_reward/std": 1.749367356300354, "rewards/length2tails_reward/mean": 0.683518648147583, "rewards/length2tails_reward/std": 0.30251583456993103, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.0939974784851074, "rewards/thermo_reward/std": 2.440037488937378, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.71875, "completions/mean_terminated_length": 271.71875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10989144165068865, "epoch": 0.684, "frac_reward_zero_std": 0.0, "grad_norm": 0.07758535444736481, "learning_rate": 1.931185722129409e-06, "loss": -0.0046, "num_tokens": 2965914.0, "reward": 11.012580871582031, "reward_std": 5.557486057281494, "rewards/fitness_reward/mean": 6.334482192993164, "rewards/fitness_reward/std": 3.2543489933013916, "rewards/kidney_reward/mean": 2.2728075981140137, "rewards/kidney_reward/std": 0.9552741050720215, "rewards/length2tails_reward/mean": 0.7059061527252197, "rewards/length2tails_reward/std": 0.31230825185775757, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.2347002029418945, "rewards/thermo_reward/std": 2.149380922317505, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11373002640902996, "epoch": 0.686, "frac_reward_zero_std": 0.0, "grad_norm": 0.2745387852191925, "learning_rate": 1.9307175067893163e-06, "loss": -0.0038, "num_tokens": 2974643.0, "reward": 12.919220924377441, "reward_std": 2.191603899002075, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.460087299346924, "rewards/kidney_reward/std": 0.6578894257545471, "rewards/length2tails_reward/mean": 0.6816491484642029, "rewards/length2tails_reward/std": 0.3293251395225525, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.929783582687378, "rewards/thermo_reward/std": 1.6954776048660278, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.3125, "completions/mean_terminated_length": 272.3125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10979206673800945, "epoch": 0.688, "frac_reward_zero_std": 0.0, "grad_norm": 0.16077430546283722, "learning_rate": 1.9302477611181375e-06, "loss": -0.0072, "num_tokens": 2983389.0, "reward": 10.854362487792969, "reward_std": 5.402705669403076, "rewards/fitness_reward/mean": 6.654323577880859, "rewards/fitness_reward/std": 2.782104015350342, "rewards/kidney_reward/mean": 2.0775442123413086, "rewards/kidney_reward/std": 1.285241961479187, "rewards/length2tails_reward/mean": 0.73471599817276, "rewards/length2tails_reward/std": 0.3409052789211273, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9490220546722412, "rewards/thermo_reward/std": 2.4640939235687256, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10362495668232441, "epoch": 0.69, "frac_reward_zero_std": 0.0, "grad_norm": 0.2713906466960907, "learning_rate": 1.929776485888251e-06, "loss": -0.0031, "num_tokens": 2992120.0, "reward": 9.594318389892578, "reward_std": 7.927021503448486, "rewards/fitness_reward/mean": 5.681684494018555, "rewards/fitness_reward/std": 3.97599196434021, "rewards/kidney_reward/mean": 1.7850568294525146, "rewards/kidney_reward/std": 1.7634572982788086, "rewards/length2tails_reward/mean": 0.6979485750198364, "rewards/length2tails_reward/std": 0.35325416922569275, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9577827453613281, "rewards/thermo_reward/std": 2.700308084487915, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.46875, "completions/mean_terminated_length": 271.46875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.12234244868159294, "epoch": 0.692, "frac_reward_zero_std": 0.0, "grad_norm": 0.08907873183488846, "learning_rate": 1.9293036818745518e-06, "loss": 0.0007, "num_tokens": 3000839.0, "reward": 9.967710494995117, "reward_std": 7.248884201049805, "rewards/fitness_reward/mean": 6.2207746505737305, "rewards/fitness_reward/std": 3.420375108718872, "rewards/kidney_reward/mean": 1.7617230415344238, "rewards/kidney_reward/std": 1.7901252508163452, "rewards/length2tails_reward/mean": 0.6708717346191406, "rewards/length2tails_reward/std": 0.3487412929534912, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.8181260824203491, "rewards/thermo_reward/std": 2.7099850177764893, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.34375, "completions/mean_terminated_length": 271.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10335982125252485, "epoch": 0.694, "frac_reward_zero_std": 0.0, "grad_norm": 0.08128883689641953, "learning_rate": 1.9288293498544467e-06, "loss": 0.0053, "num_tokens": 3009554.0, "reward": 9.647581100463867, "reward_std": 7.206797122955322, "rewards/fitness_reward/mean": 5.884220123291016, "rewards/fitness_reward/std": 3.81189227104187, "rewards/kidney_reward/mean": 1.8862109184265137, "rewards/kidney_reward/std": 1.6567944288253784, "rewards/length2tails_reward/mean": 0.6482487916946411, "rewards/length2tails_reward/std": 0.37604981660842896, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.712324857711792, "rewards/thermo_reward/std": 2.719363212585449, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.71875, "completions/mean_terminated_length": 271.71875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09986438229680061, "epoch": 0.696, "frac_reward_zero_std": 0.0, "grad_norm": 0.08426442742347717, "learning_rate": 1.928353490607855e-06, "loss": -0.0001, "num_tokens": 3018281.0, "reward": 10.494661331176758, "reward_std": 6.4393086433410645, "rewards/fitness_reward/mean": 6.288505554199219, "rewards/fitness_reward/std": 3.211290121078491, "rewards/kidney_reward/mean": 1.9297271966934204, "rewards/kidney_reward/std": 1.5727407932281494, "rewards/length2tails_reward/mean": 0.7330443263053894, "rewards/length2tails_reward/std": 0.30768275260925293, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.103123664855957, "rewards/thermo_reward/std": 2.4181745052337646, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.625, "completions/mean_terminated_length": 270.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10438127210363746, "epoch": 0.698, "frac_reward_zero_std": 0.0, "grad_norm": 0.745772659778595, "learning_rate": 1.9278761049172087e-06, "loss": -0.0061, "num_tokens": 3026973.0, "reward": 9.632065773010254, "reward_std": 7.670244216918945, "rewards/fitness_reward/mean": 5.907223224639893, "rewards/fitness_reward/std": 3.7570505142211914, "rewards/kidney_reward/mean": 1.6641534566879272, "rewards/kidney_reward/std": 2.020569086074829, "rewards/length2tails_reward/mean": 0.6299859285354614, "rewards/length2tails_reward/std": 0.3401232361793518, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.8976908922195435, "rewards/thermo_reward/std": 2.6126625537872314, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.84375, "completions/mean_terminated_length": 270.84375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10190891660749912, "epoch": 0.7, "frac_reward_zero_std": 0.0, "grad_norm": 0.0659673810005188, "learning_rate": 1.927397193567448e-06, "loss": -0.0023, "num_tokens": 3035672.0, "reward": 11.571775436401367, "reward_std": 4.5618414878845215, "rewards/fitness_reward/mean": 6.948239326477051, "rewards/fitness_reward/std": 2.026416063308716, "rewards/kidney_reward/mean": 2.2341322898864746, "rewards/kidney_reward/std": 1.0163410902023315, "rewards/length2tails_reward/mean": 0.6096740961074829, "rewards/length2tails_reward/std": 0.37831398844718933, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.2284371852874756, "rewards/thermo_reward/std": 2.161548137664795, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.71875, "completions/mean_terminated_length": 270.71875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11552073899656534, "epoch": 0.702, "frac_reward_zero_std": 0.0, "grad_norm": 0.5920554399490356, "learning_rate": 1.9269167573460217e-06, "loss": -0.0022, "num_tokens": 3044367.0, "reward": 10.978547096252441, "reward_std": 6.401198863983154, "rewards/fitness_reward/mean": 6.606268882751465, "rewards/fitness_reward/std": 2.7593371868133545, "rewards/kidney_reward/mean": 2.0434393882751465, "rewards/kidney_reward/std": 1.6172335147857666, "rewards/length2tails_reward/mean": 0.6069375276565552, "rewards/length2tails_reward/std": 0.37754398584365845, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.168145179748535, "rewards/thermo_reward/std": 2.4649102687835693, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11476656049489975, "epoch": 0.704, "frac_reward_zero_std": 0.0, "grad_norm": 0.06389094144105911, "learning_rate": 1.926434797042887e-06, "loss": -0.0016, "num_tokens": 3053087.0, "reward": 10.932390213012695, "reward_std": 5.4943342208862305, "rewards/fitness_reward/mean": 6.568968772888184, "rewards/fitness_reward/std": 2.8947854042053223, "rewards/kidney_reward/mean": 2.112169027328491, "rewards/kidney_reward/std": 1.2832504510879517, "rewards/length2tails_reward/mean": 0.685340166091919, "rewards/length2tails_reward/std": 0.3321627676486969, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.082718849182129, "rewards/thermo_reward/std": 2.573854446411133, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.46875, "completions/mean_terminated_length": 271.46875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1000164458528161, "epoch": 0.706, "frac_reward_zero_std": 0.0, "grad_norm": 0.11736225336790085, "learning_rate": 1.9259513134505073e-06, "loss": -0.0043, "num_tokens": 3061806.0, "reward": 9.778669357299805, "reward_std": 6.8691511154174805, "rewards/fitness_reward/mean": 5.954867362976074, "rewards/fitness_reward/std": 3.625878095626831, "rewards/kidney_reward/mean": 1.931515097618103, "rewards/kidney_reward/std": 1.5763072967529297, "rewards/length2tails_reward/mean": 0.66201251745224, "rewards/length2tails_reward/std": 0.3817025125026703, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.7260853052139282, "rewards/thermo_reward/std": 2.684684991836548, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10831321962177753, "epoch": 0.708, "frac_reward_zero_std": 0.0, "grad_norm": 0.5270610451698303, "learning_rate": 1.9254663073638492e-06, "loss": -0.0037, "num_tokens": 3070527.0, "reward": 11.321937561035156, "reward_std": 6.7053937911987305, "rewards/fitness_reward/mean": 6.6266045570373535, "rewards/fitness_reward/std": 2.8948702812194824, "rewards/kidney_reward/mean": 1.9176154136657715, "rewards/kidney_reward/std": 1.8548389673233032, "rewards/length2tails_reward/mean": 0.694791316986084, "rewards/length2tails_reward/std": 0.3296026885509491, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.608238458633423, "rewards/thermo_reward/std": 2.3834054470062256, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11527068726718426, "epoch": 0.71, "frac_reward_zero_std": 0.0, "grad_norm": 0.1380949169397354, "learning_rate": 1.924979779580383e-06, "loss": -0.004, "num_tokens": 3079275.0, "reward": 9.897174835205078, "reward_std": 7.3936848640441895, "rewards/fitness_reward/mean": 6.241214752197266, "rewards/fitness_reward/std": 3.3536489009857178, "rewards/kidney_reward/mean": 1.7082035541534424, "rewards/kidney_reward/std": 1.9595487117767334, "rewards/length2tails_reward/mean": 0.7306416630744934, "rewards/length2tails_reward/std": 0.3336907625198364, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.7746928930282593, "rewards/thermo_reward/std": 2.548712968826294, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 264.1875, "completions/mean_terminated_length": 264.1875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.11325905099511147, "epoch": 0.712, "frac_reward_zero_std": 0.0, "grad_norm": 1.0523838996887207, "learning_rate": 1.9244917309000816e-06, "loss": -0.0836, "num_tokens": 3087761.0, "reward": 9.5830078125, "reward_std": 7.340405464172363, "rewards/fitness_reward/mean": 6.4399871826171875, "rewards/fitness_reward/std": 2.9903438091278076, "rewards/kidney_reward/mean": 1.6315879821777344, "rewards/kidney_reward/std": 2.1313037872314453, "rewards/length2tails_reward/mean": 0.6992827653884888, "rewards/length2tails_reward/std": 0.29506129026412964, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.3415043354034424, "rewards/thermo_reward/std": 3.074310302734375, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.78125, "completions/mean_terminated_length": 270.78125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.1113341748714447, "epoch": 0.714, "frac_reward_zero_std": 0.0, "grad_norm": 0.057726092636585236, "learning_rate": 1.9240021621254186e-06, "loss": -0.0009, "num_tokens": 3096458.0, "reward": 11.602544784545898, "reward_std": 4.918376445770264, "rewards/fitness_reward/mean": 6.883650779724121, "rewards/fitness_reward/std": 2.0794548988342285, "rewards/kidney_reward/mean": 2.2405173778533936, "rewards/kidney_reward/std": 1.1649236679077148, "rewards/length2tails_reward/mean": 0.6127895712852478, "rewards/length2tails_reward/std": 0.34780457615852356, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.317099094390869, "rewards/thermo_reward/std": 2.3603625297546387, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.03125, "completions/mean_terminated_length": 270.03125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10471488256007433, "epoch": 0.716, "frac_reward_zero_std": 0.0, "grad_norm": 0.13354143500328064, "learning_rate": 1.9235110740613667e-06, "loss": -0.0041, "num_tokens": 3105131.0, "reward": 11.289546966552734, "reward_std": 6.243673324584961, "rewards/fitness_reward/mean": 6.612612724304199, "rewards/fitness_reward/std": 2.7366530895233154, "rewards/kidney_reward/mean": 2.1773085594177246, "rewards/kidney_reward/std": 1.4662457704544067, "rewards/length2tails_reward/mean": 0.5935394763946533, "rewards/length2tails_reward/std": 0.36415228247642517, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.340271472930908, "rewards/thermo_reward/std": 2.4543654918670654, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.28125, "completions/mean_terminated_length": 270.28125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10732367634773254, "epoch": 0.718, "frac_reward_zero_std": 0.0, "grad_norm": 0.07978679984807968, "learning_rate": 1.9230184675153973e-06, "loss": -0.0057, "num_tokens": 3113812.0, "reward": 10.384027481079102, "reward_std": 6.627580165863037, "rewards/fitness_reward/mean": 6.309484481811523, "rewards/fitness_reward/std": 3.155627727508545, "rewards/kidney_reward/mean": 1.8931320905685425, "rewards/kidney_reward/std": 1.6427026987075806, "rewards/length2tails_reward/mean": 0.6456698179244995, "rewards/length2tails_reward/std": 0.36801210045814514, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 2.0230941772460938, "rewards/thermo_reward/std": 2.5650124549865723, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.59375, "completions/mean_terminated_length": 270.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10083027090877295, "epoch": 0.72, "frac_reward_zero_std": 0.0, "grad_norm": 0.0760909765958786, "learning_rate": 1.9225243432974772e-06, "loss": -0.0039, "num_tokens": 3122503.0, "reward": 11.693754196166992, "reward_std": 3.8968091011047363, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.452592670917511, "rewards/kidney_reward/mean": 2.171109199523926, "rewards/kidney_reward/std": 1.2216949462890625, "rewards/length2tails_reward/mean": 0.5941706299781799, "rewards/length2tails_reward/std": 0.35726645588874817, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.117060661315918, "rewards/thermo_reward/std": 2.537241220474243, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.5, "completions/mean_terminated_length": 270.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10586319025605917, "epoch": 0.722, "frac_reward_zero_std": 0.0, "grad_norm": 0.09383681416511536, "learning_rate": 1.9220287022200707e-06, "loss": -0.0049, "num_tokens": 3131191.0, "reward": 11.630992889404297, "reward_std": 5.257022380828857, "rewards/fitness_reward/mean": 6.733970642089844, "rewards/fitness_reward/std": 2.468461513519287, "rewards/kidney_reward/mean": 2.2140183448791504, "rewards/kidney_reward/std": 1.1709821224212646, "rewards/length2tails_reward/mean": 0.6167657375335693, "rewards/length2tails_reward/std": 0.37922364473342896, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.521327257156372, "rewards/thermo_reward/std": 2.263733148574829, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 273.625, "completions/mean_terminated_length": 273.625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.11059287562966347, "epoch": 0.724, "frac_reward_zero_std": 0.0, "grad_norm": 0.11690595746040344, "learning_rate": 1.9215315450981336e-06, "loss": -0.0029, "num_tokens": 3139979.0, "reward": 11.296387672424316, "reward_std": 5.4516825675964355, "rewards/fitness_reward/mean": 6.644530296325684, "rewards/fitness_reward/std": 2.60485577583313, "rewards/kidney_reward/mean": 2.0352516174316406, "rewards/kidney_reward/std": 1.3362797498703003, "rewards/length2tails_reward/mean": 0.7619137763977051, "rewards/length2tails_reward/std": 0.327181875705719, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4404149055480957, "rewards/thermo_reward/std": 2.0719480514526367, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10017055552452803, "epoch": 0.726, "frac_reward_zero_std": 0.0, "grad_norm": 0.16517867147922516, "learning_rate": 1.9210328727491173e-06, "loss": -0.002, "num_tokens": 3148695.0, "reward": 10.863248825073242, "reward_std": 6.320736408233643, "rewards/fitness_reward/mean": 6.588662147521973, "rewards/fitness_reward/std": 2.817298412322998, "rewards/kidney_reward/mean": 2.011671304702759, "rewards/kidney_reward/std": 1.5752159357070923, "rewards/length2tails_reward/mean": 0.651005744934082, "rewards/length2tails_reward/std": 0.37302064895629883, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.0978143215179443, "rewards/thermo_reward/std": 2.426593542098999, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10572062712162733, "epoch": 0.728, "frac_reward_zero_std": 0.0, "grad_norm": 0.5161871314048767, "learning_rate": 1.920532685992962e-06, "loss": -0.0005, "num_tokens": 3157359.0, "reward": 10.26416015625, "reward_std": 6.127639293670654, "rewards/fitness_reward/mean": 6.581520080566406, "rewards/fitness_reward/std": 2.626322031021118, "rewards/kidney_reward/mean": 1.7820661067962646, "rewards/kidney_reward/std": 1.6363115310668945, "rewards/length2tails_reward/mean": 0.553253173828125, "rewards/length2tails_reward/std": 0.39293044805526733, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.745248794555664, "rewards/thermo_reward/std": 2.6991000175476074, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.6875, "completions/mean_terminated_length": 271.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1089264303445816, "epoch": 0.73, "frac_reward_zero_std": 0.0, "grad_norm": 1.0864341259002686, "learning_rate": 1.9200309856520996e-06, "loss": -0.004, "num_tokens": 3166085.0, "reward": 12.792705535888672, "reward_std": 2.40315580368042, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.394333600997925, "rewards/kidney_reward/std": 0.8731848001480103, "rewards/length2tails_reward/mean": 0.62489914894104, "rewards/length2tails_reward/std": 0.3941367268562317, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.874697208404541, "rewards/thermo_reward/std": 1.6326818466186523, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 269.78125, "completions/mean_terminated_length": 269.78125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11107066180557013, "epoch": 0.732, "frac_reward_zero_std": 0.0, "grad_norm": 0.11344432085752487, "learning_rate": 1.9195277725514506e-06, "loss": -0.0078, "num_tokens": 3174750.0, "reward": 11.069419860839844, "reward_std": 4.182448387145996, "rewards/fitness_reward/mean": 6.938035011291504, "rewards/fitness_reward/std": 1.840762972831726, "rewards/kidney_reward/mean": 2.2398624420166016, "rewards/kidney_reward/std": 1.0455249547958374, "rewards/length2tails_reward/mean": 0.5537076592445374, "rewards/length2tails_reward/std": 0.387100487947464, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.7361513376235962, "rewards/thermo_reward/std": 2.2292439937591553, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.9375, "completions/mean_terminated_length": 271.9375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11792270187288523, "epoch": 0.734, "frac_reward_zero_std": 0.0, "grad_norm": 0.210047647356987, "learning_rate": 1.9190230475184223e-06, "loss": -0.0064, "num_tokens": 3183484.0, "reward": 11.371177673339844, "reward_std": 4.733616828918457, "rewards/fitness_reward/mean": 6.433873176574707, "rewards/fitness_reward/std": 2.9292948246002197, "rewards/kidney_reward/mean": 2.1523091793060303, "rewards/kidney_reward/std": 0.984671413898468, "rewards/length2tails_reward/mean": 0.7043337821960449, "rewards/length2tails_reward/std": 0.3285147249698639, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6145615577697754, "rewards/thermo_reward/std": 1.4447458982467651, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10720256343483925, "epoch": 0.736, "frac_reward_zero_std": 0.0, "grad_norm": 0.11020854860544205, "learning_rate": 1.9185168113829076e-06, "loss": -0.0084, "num_tokens": 3192188.0, "reward": 11.898595809936523, "reward_std": 2.5480897426605225, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.449726104736328, "rewards/kidney_reward/std": 0.5813888907432556, "rewards/length2tails_reward/mean": 0.6259234547615051, "rewards/length2tails_reward/std": 0.3515353500843048, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9826017618179321, "rewards/thermo_reward/std": 2.1645750999450684, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.78125, "completions/mean_terminated_length": 270.78125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10751251876354218, "epoch": 0.738, "frac_reward_zero_std": 0.0, "grad_norm": 0.09786931425333023, "learning_rate": 1.9180090649772854e-06, "loss": -0.0014, "num_tokens": 3200885.0, "reward": 7.86342191696167, "reward_std": 9.2483491897583, "rewards/fitness_reward/mean": 4.914643287658691, "rewards/fitness_reward/std": 4.593165397644043, "rewards/kidney_reward/mean": 1.3660173416137695, "rewards/kidney_reward/std": 2.202097177505493, "rewards/length2tails_reward/mean": 0.645307183265686, "rewards/length2tails_reward/std": 0.36554959416389465, "rewards/repeated_in_batch_reward/mean": 0.90625, "rewards/repeated_in_batch_reward/std": 0.2961445748806, "rewards/thermo_reward/mean": 1.427605390548706, "rewards/thermo_reward/std": 2.9483962059020996, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 271.59375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.1021569799631834, "epoch": 0.74, "frac_reward_zero_std": 0.0, "grad_norm": 0.18761053681373596, "learning_rate": 1.9174998091364167e-06, "loss": -0.0076, "num_tokens": 3209608.0, "reward": 9.419897079467773, "reward_std": 7.5619916915893555, "rewards/fitness_reward/mean": 6.109926223754883, "rewards/fitness_reward/std": 3.258395195007324, "rewards/kidney_reward/mean": 1.5979440212249756, "rewards/kidney_reward/std": 2.025669574737549, "rewards/length2tails_reward/mean": 0.6712652444839478, "rewards/length2tails_reward/std": 0.37890389561653137, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.5449011325836182, "rewards/thermo_reward/std": 2.8237903118133545, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.0625, "completions/mean_terminated_length": 270.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10275615192949772, "epoch": 0.742, "frac_reward_zero_std": 0.0, "grad_norm": 0.17681962251663208, "learning_rate": 1.9169890446976452e-06, "loss": -0.0044, "num_tokens": 3218282.0, "reward": 8.854126930236816, "reward_std": 7.579990863800049, "rewards/fitness_reward/mean": 5.729085922241211, "rewards/fitness_reward/std": 3.860947608947754, "rewards/kidney_reward/mean": 1.782204508781433, "rewards/kidney_reward/std": 1.6522729396820068, "rewards/length2tails_reward/mean": 0.5530422925949097, "rewards/length2tails_reward/std": 0.39241811633110046, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 1.1937828063964844, "rewards/thermo_reward/std": 3.0032665729522705, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.5, "completions/mean_terminated_length": 270.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10601808875799179, "epoch": 0.744, "frac_reward_zero_std": 0.0, "grad_norm": 0.1457470804452896, "learning_rate": 1.916476772500794e-06, "loss": -0.0058, "num_tokens": 3226970.0, "reward": 12.296627044677734, "reward_std": 2.8065128326416016, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.430830955505371, "rewards/kidney_reward/std": 0.6787682175636292, "rewards/length2tails_reward/mean": 0.6328158378601074, "rewards/length2tails_reward/std": 0.3681427538394928, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.398838520050049, "rewards/thermo_reward/std": 2.134150981903076, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 262.84375, "completions/mean_terminated_length": 262.84375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.10207627713680267, "epoch": 0.746, "frac_reward_zero_std": 0.0, "grad_norm": 0.3557395040988922, "learning_rate": 1.9159629933881667e-06, "loss": -0.1238, "num_tokens": 3235413.0, "reward": 11.494861602783203, "reward_std": 4.767448902130127, "rewards/fitness_reward/mean": 6.9528045654296875, "rewards/fitness_reward/std": 2.0009288787841797, "rewards/kidney_reward/mean": 2.1724929809570312, "rewards/kidney_reward/std": 1.3247390985488892, "rewards/length2tails_reward/mean": 0.5239368677139282, "rewards/length2tails_reward/std": 0.39487963914871216, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.2171709537506104, "rewards/thermo_reward/std": 2.2351832389831543, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 265.09375, "completions/mean_terminated_length": 265.09375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.10754709504544735, "epoch": 0.748, "frac_reward_zero_std": 0.0, "grad_norm": 0.6204034686088562, "learning_rate": 1.9154477082045434e-06, "loss": -0.0661, "num_tokens": 3243928.0, "reward": 8.130387306213379, "reward_std": 8.025690078735352, "rewards/fitness_reward/mean": 5.279109954833984, "rewards/fitness_reward/std": 4.192902088165283, "rewards/kidney_reward/mean": 1.303051233291626, "rewards/kidney_reward/std": 2.06316876411438, "rewards/length2tails_reward/mean": 0.6537559032440186, "rewards/length2tails_reward/std": 0.36036211252212524, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.3828513622283936, "rewards/thermo_reward/std": 2.7855420112609863, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10372466966509819, "epoch": 0.75, "frac_reward_zero_std": 0.0, "grad_norm": 0.20361915230751038, "learning_rate": 1.914930917797181e-06, "loss": -0.0033, "num_tokens": 3252636.0, "reward": 9.382475852966309, "reward_std": 8.167882919311523, "rewards/fitness_reward/mean": 5.807817459106445, "rewards/fitness_reward/std": 3.692617893218994, "rewards/kidney_reward/mean": 1.5686777830123901, "rewards/kidney_reward/std": 2.1148335933685303, "rewards/length2tails_reward/mean": 0.6882462501525879, "rewards/length2tails_reward/std": 0.33273714780807495, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 1.8434056043624878, "rewards/thermo_reward/std": 2.7445905208587646, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11489119380712509, "epoch": 0.752, "frac_reward_zero_std": 0.0, "grad_norm": 1.2641088962554932, "learning_rate": 1.9144126230158124e-06, "loss": -0.0028, "num_tokens": 3261360.0, "reward": 10.802129745483398, "reward_std": 5.9253644943237305, "rewards/fitness_reward/mean": 6.857111930847168, "rewards/fitness_reward/std": 2.2262284755706787, "rewards/kidney_reward/mean": 1.873659372329712, "rewards/kidney_reward/std": 1.7289634943008423, "rewards/length2tails_reward/mean": 0.6913347840309143, "rewards/length2tails_reward/std": 0.3142402470111847, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9022259712219238, "rewards/thermo_reward/std": 2.595896005630493, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.5, "completions/mean_terminated_length": 269.5, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.10804861318320036, "epoch": 0.754, "frac_reward_zero_std": 0.0, "grad_norm": 0.3748019337654114, "learning_rate": 1.913892824712642e-06, "loss": -0.0373, "num_tokens": 3270016.0, "reward": 9.500389099121094, "reward_std": 8.477140426635742, "rewards/fitness_reward/mean": 5.897690773010254, "rewards/fitness_reward/std": 3.9426934719085693, "rewards/kidney_reward/mean": 1.6560649871826172, "rewards/kidney_reward/std": 2.092923641204834, "rewards/length2tails_reward/mean": 0.7651374936103821, "rewards/length2tails_reward/std": 0.31899499893188477, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.770119547843933, "rewards/thermo_reward/std": 2.8880178928375244, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.46875, "completions/mean_terminated_length": 270.46875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10575198102742434, "epoch": 0.756, "frac_reward_zero_std": 0.0, "grad_norm": 0.09856077283620834, "learning_rate": 1.9133715237423485e-06, "loss": -0.0107, "num_tokens": 3278703.0, "reward": 11.312349319458008, "reward_std": 3.7839372158050537, "rewards/fitness_reward/mean": 7.188657760620117, "rewards/fitness_reward/std": 0.7179933190345764, "rewards/kidney_reward/mean": 2.090399742126465, "rewards/kidney_reward/std": 1.1582475900650024, "rewards/length2tails_reward/mean": 0.5980393290519714, "rewards/length2tails_reward/std": 0.3704410791397095, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.8734878301620483, "rewards/thermo_reward/std": 2.3937807083129883, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.15625, "completions/mean_terminated_length": 271.15625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10157476831227541, "epoch": 0.758, "frac_reward_zero_std": 0.0, "grad_norm": 0.08300302922725677, "learning_rate": 1.91284872096208e-06, "loss": -0.0003, "num_tokens": 3287412.0, "reward": 10.58859920501709, "reward_std": 5.9171247482299805, "rewards/fitness_reward/mean": 6.681866645812988, "rewards/fitness_reward/std": 2.685343027114868, "rewards/kidney_reward/mean": 1.87894606590271, "rewards/kidney_reward/std": 1.5297423601150513, "rewards/length2tails_reward/mean": 0.6118855476379395, "rewards/length2tails_reward/std": 0.3672679364681244, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.8665984869003296, "rewards/thermo_reward/std": 2.7235138416290283, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10028427001088858, "epoch": 0.76, "frac_reward_zero_std": 0.0, "grad_norm": 0.11919562518596649, "learning_rate": 1.912324417231454e-06, "loss": 0.0052, "num_tokens": 3296120.0, "reward": 8.564162254333496, "reward_std": 9.746264457702637, "rewards/fitness_reward/mean": 5.225951194763184, "rewards/fitness_reward/std": 4.518463611602783, "rewards/kidney_reward/mean": 1.458353042602539, "rewards/kidney_reward/std": 2.350266933441162, "rewards/length2tails_reward/mean": 0.6691723465919495, "rewards/length2tails_reward/std": 0.3536635637283325, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.7129408121109009, "rewards/thermo_reward/std": 3.088613748550415, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.09375, "completions/mean_terminated_length": 272.09375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11146265547722578, "epoch": 0.762, "frac_reward_zero_std": 0.0, "grad_norm": 0.2094751000404358, "learning_rate": 1.9117986134125567e-06, "loss": 0.0036, "num_tokens": 3304859.0, "reward": 11.463823318481445, "reward_std": 6.830804824829102, "rewards/fitness_reward/mean": 6.590934753417969, "rewards/fitness_reward/std": 3.0370945930480957, "rewards/kidney_reward/mean": 2.0719785690307617, "rewards/kidney_reward/std": 1.6950889825820923, "rewards/length2tails_reward/mean": 0.7252353429794312, "rewards/length2tails_reward/std": 0.32576078176498413, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.628385543823242, "rewards/thermo_reward/std": 2.4370651245117188, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.4375, "completions/mean_terminated_length": 271.4375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11498783808201551, "epoch": 0.764, "frac_reward_zero_std": 0.0, "grad_norm": 0.1258348971605301, "learning_rate": 1.91127131036994e-06, "loss": 0.0005, "num_tokens": 3313577.0, "reward": 10.639206886291504, "reward_std": 6.834832668304443, "rewards/fitness_reward/mean": 6.58353853225708, "rewards/fitness_reward/std": 2.860297918319702, "rewards/kidney_reward/mean": 1.8589577674865723, "rewards/kidney_reward/std": 1.873213768005371, "rewards/length2tails_reward/mean": 0.671999990940094, "rewards/length2tails_reward/std": 0.32619988918304443, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.029510259628296, "rewards/thermo_reward/std": 2.685245990753174, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.6875, "completions/mean_terminated_length": 270.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10606464743614197, "epoch": 0.766, "frac_reward_zero_std": 0.0, "grad_norm": 0.05109286680817604, "learning_rate": 1.9107425089706216e-06, "loss": -0.0047, "num_tokens": 3322271.0, "reward": 12.418526649475098, "reward_std": 3.3464503288269043, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.389791488647461, "rewards/kidney_reward/std": 0.7344213128089905, "rewards/length2tails_reward/mean": 0.6418942213058472, "rewards/length2tails_reward/std": 0.3552281856536865, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8114919662475586, "rewards/thermo_reward/std": 1.5152373313903809, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.4375, "completions/mean_terminated_length": 270.4375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11245891731232405, "epoch": 0.768, "frac_reward_zero_std": 0.0, "grad_norm": 0.07938886433839798, "learning_rate": 1.9102122100840824e-06, "loss": 0.0009, "num_tokens": 3330957.0, "reward": 10.211996078491211, "reward_std": 6.531548976898193, "rewards/fitness_reward/mean": 5.886880874633789, "rewards/fitness_reward/std": 3.6463463306427, "rewards/kidney_reward/mean": 1.9389326572418213, "rewards/kidney_reward/std": 1.5082110166549683, "rewards/length2tails_reward/mean": 0.6228433847427368, "rewards/length2tails_reward/std": 0.3479934334754944, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.223897933959961, "rewards/thermo_reward/std": 2.420728921890259, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10842283070087433, "epoch": 0.77, "frac_reward_zero_std": 0.0, "grad_norm": 0.1542162448167801, "learning_rate": 1.9096804145822653e-06, "loss": -0.0018, "num_tokens": 3339701.0, "reward": 11.679430961608887, "reward_std": 4.462829113006592, "rewards/fitness_reward/mean": 6.991600036621094, "rewards/fitness_reward/std": 1.7847418785095215, "rewards/kidney_reward/mean": 2.2454633712768555, "rewards/kidney_reward/std": 1.1345385313034058, "rewards/length2tails_reward/mean": 0.7023622989654541, "rewards/length2tails_reward/std": 0.35093632340431213, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.2721309661865234, "rewards/thermo_reward/std": 2.254243850708008, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.96875, "completions/mean_terminated_length": 270.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10247174184769392, "epoch": 0.772, "frac_reward_zero_std": 0.0, "grad_norm": 0.14088068902492523, "learning_rate": 1.9091471233395747e-06, "loss": 0.0021, "num_tokens": 3348404.0, "reward": 10.934767723083496, "reward_std": 6.660598278045654, "rewards/fitness_reward/mean": 6.2669243812561035, "rewards/fitness_reward/std": 3.459750175476074, "rewards/kidney_reward/mean": 2.1590218544006348, "rewards/kidney_reward/std": 1.5328222513198853, "rewards/length2tails_reward/mean": 0.6023874282836914, "rewards/length2tails_reward/std": 0.37724438309669495, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.3485822677612305, "rewards/thermo_reward/std": 2.554950475692749, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11835763789713383, "epoch": 0.774, "frac_reward_zero_std": 0.0, "grad_norm": 0.6285804510116577, "learning_rate": 1.9086123372328743e-06, "loss": 0.0016, "num_tokens": 3357135.0, "reward": 11.003572463989258, "reward_std": 6.561087608337402, "rewards/fitness_reward/mean": 6.622054576873779, "rewards/fitness_reward/std": 2.910947799682617, "rewards/kidney_reward/mean": 2.1000421047210693, "rewards/kidney_reward/std": 1.6415328979492188, "rewards/length2tails_reward/mean": 0.7079252004623413, "rewards/length2tails_reward/std": 0.2862622141838074, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.110682964324951, "rewards/thermo_reward/std": 2.5230486392974854, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.8125, "completions/mean_terminated_length": 271.8125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11549379862844944, "epoch": 0.776, "frac_reward_zero_std": 0.0, "grad_norm": 0.20497001707553864, "learning_rate": 1.9080760571414853e-06, "loss": 0.0005, "num_tokens": 3365865.0, "reward": 11.762812614440918, "reward_std": 3.7451224327087402, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.2056479454040527, "rewards/kidney_reward/std": 0.9120680689811707, "rewards/length2tails_reward/mean": 0.709801197052002, "rewards/length2tails_reward/std": 0.3223036825656891, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.3331310749053955, "rewards/thermo_reward/std": 1.9911855459213257, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.3125, "completions/mean_terminated_length": 271.3125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10378851741552353, "epoch": 0.778, "frac_reward_zero_std": 0.0, "grad_norm": 0.09145166724920273, "learning_rate": 1.9075382839471853e-06, "loss": -0.0065, "num_tokens": 3374579.0, "reward": 11.425798416137695, "reward_std": 3.9687082767486572, "rewards/fitness_reward/mean": 6.994507789611816, "rewards/fitness_reward/std": 1.7685731649398804, "rewards/kidney_reward/mean": 2.1150007247924805, "rewards/kidney_reward/std": 1.1914541721343994, "rewards/length2tails_reward/mean": 0.6470258235931396, "rewards/length2tails_reward/std": 0.3857588469982147, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.151587963104248, "rewards/thermo_reward/std": 2.378540277481079, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.59375, "completions/mean_terminated_length": 270.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10269285459071398, "epoch": 0.78, "frac_reward_zero_std": 0.0, "grad_norm": 0.1557752788066864, "learning_rate": 1.9069990185342073e-06, "loss": -0.0039, "num_tokens": 3383270.0, "reward": 10.588491439819336, "reward_std": 5.960216045379639, "rewards/fitness_reward/mean": 6.198329925537109, "rewards/fitness_reward/std": 3.3503475189208984, "rewards/kidney_reward/mean": 2.1871232986450195, "rewards/kidney_reward/std": 1.2717788219451904, "rewards/length2tails_reward/mean": 0.6088348627090454, "rewards/length2tails_reward/std": 0.3780163526535034, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.0421547889709473, "rewards/thermo_reward/std": 2.5141873359680176, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.78125, "completions/mean_terminated_length": 270.78125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10547183640301228, "epoch": 0.782, "frac_reward_zero_std": 0.0, "grad_norm": 0.12822791934013367, "learning_rate": 1.906458261789238e-06, "loss": 0.0014, "num_tokens": 3391967.0, "reward": 11.095891952514648, "reward_std": 6.153520584106445, "rewards/fitness_reward/mean": 6.617605209350586, "rewards/fitness_reward/std": 2.7073137760162354, "rewards/kidney_reward/mean": 2.1009960174560547, "rewards/kidney_reward/std": 1.5732152462005615, "rewards/length2tails_reward/mean": 0.6067447662353516, "rewards/length2tails_reward/std": 0.3467532694339752, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.216616153717041, "rewards/thermo_reward/std": 2.4701502323150635, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 271.3125, "completions/mean_terminated_length": 271.3125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10272944066673517, "epoch": 0.784, "frac_reward_zero_std": 0.0, "grad_norm": 0.06617357581853867, "learning_rate": 1.905916014601416e-06, "loss": -0.0051, "num_tokens": 3400681.0, "reward": 11.105353355407715, "reward_std": 5.018069744110107, "rewards/fitness_reward/mean": 6.5859375, "rewards/fitness_reward/std": 2.8291032314300537, "rewards/kidney_reward/mean": 2.181995153427124, "rewards/kidney_reward/std": 1.045772671699524, "rewards/length2tails_reward/mean": 0.6834899187088013, "rewards/length2tails_reward/std": 0.3619054853916168, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.169072151184082, "rewards/thermo_reward/std": 2.2546708583831787, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.1875, "completions/mean_terminated_length": 271.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10134368669241667, "epoch": 0.786, "frac_reward_zero_std": 0.0, "grad_norm": 0.12521199882030487, "learning_rate": 1.9053722778623303e-06, "loss": -0.0058, "num_tokens": 3409391.0, "reward": 10.825408935546875, "reward_std": 5.762734413146973, "rewards/fitness_reward/mean": 6.575164794921875, "rewards/fitness_reward/std": 2.658289670944214, "rewards/kidney_reward/mean": 2.0606515407562256, "rewards/kidney_reward/std": 1.5197820663452148, "rewards/length2tails_reward/mean": 0.6492632627487183, "rewards/length2tails_reward/std": 0.3747102618217468, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.0246660709381104, "rewards/thermo_reward/std": 2.344879627227783, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.78125, "completions/mean_terminated_length": 270.78125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11077911220490932, "epoch": 0.788, "frac_reward_zero_std": 0.0, "grad_norm": 0.09796229749917984, "learning_rate": 1.9048270524660196e-06, "loss": -0.0025, "num_tokens": 3418088.0, "reward": 10.854618072509766, "reward_std": 5.408969879150391, "rewards/fitness_reward/mean": 6.669445991516113, "rewards/fitness_reward/std": 2.502680540084839, "rewards/kidney_reward/mean": 2.0772039890289307, "rewards/kidney_reward/std": 1.2458466291427612, "rewards/length2tails_reward/mean": 0.6614983081817627, "rewards/length2tails_reward/std": 0.337128609418869, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9418175220489502, "rewards/thermo_reward/std": 2.4592199325561523, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 269.46875, "completions/mean_terminated_length": 269.46875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09399615973234177, "epoch": 0.79, "frac_reward_zero_std": 0.0, "grad_norm": 0.17700549960136414, "learning_rate": 1.9042803393089697e-06, "loss": -0.0078, "num_tokens": 3426743.0, "reward": 10.897236824035645, "reward_std": 6.1277265548706055, "rewards/fitness_reward/mean": 6.1432342529296875, "rewards/fitness_reward/std": 3.33506441116333, "rewards/kidney_reward/mean": 2.0403788089752197, "rewards/kidney_reward/std": 1.4263689517974854, "rewards/length2tails_reward/mean": 0.5339868068695068, "rewards/length2tails_reward/std": 0.36498579382896423, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.560225009918213, "rewards/thermo_reward/std": 2.0816314220428467, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09640540461987257, "epoch": 0.792, "frac_reward_zero_std": 0.0, "grad_norm": 0.05166810750961304, "learning_rate": 1.9037321392901133e-06, "loss": -0.0002, "num_tokens": 3435481.0, "reward": 11.834596633911133, "reward_std": 4.384562969207764, "rewards/fitness_reward/mean": 7.010133266448975, "rewards/fitness_reward/std": 1.9858490228652954, "rewards/kidney_reward/mean": 2.284642219543457, "rewards/kidney_reward/std": 1.0346956253051758, "rewards/length2tails_reward/mean": 0.6679773330688477, "rewards/length2tails_reward/std": 0.3801604211330414, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.373023748397827, "rewards/thermo_reward/std": 1.8982181549072266, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.65625, "completions/mean_terminated_length": 270.65625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10857865959405899, "epoch": 0.794, "frac_reward_zero_std": 0.0, "grad_norm": 0.10565046966075897, "learning_rate": 1.9031824533108277e-06, "loss": -0.0042, "num_tokens": 3444174.0, "reward": 11.789709091186523, "reward_std": 4.957429885864258, "rewards/fitness_reward/mean": 6.921901226043701, "rewards/fitness_reward/std": 1.926413655281067, "rewards/kidney_reward/mean": 2.1898322105407715, "rewards/kidney_reward/std": 1.3164691925048828, "rewards/length2tails_reward/mean": 0.6206187605857849, "rewards/length2tails_reward/std": 0.3860568404197693, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.51591420173645, "rewards/thermo_reward/std": 2.3009755611419678, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10428552236407995, "epoch": 0.796, "frac_reward_zero_std": 0.0, "grad_norm": 0.9575762152671814, "learning_rate": 1.9026312822749331e-06, "loss": 0.0176, "num_tokens": 3452942.0, "reward": 10.670546531677246, "reward_std": 5.758018970489502, "rewards/fitness_reward/mean": 6.642149925231934, "rewards/fitness_reward/std": 2.614790201187134, "rewards/kidney_reward/mean": 1.78218674659729, "rewards/kidney_reward/std": 1.4918310642242432, "rewards/length2tails_reward/mean": 0.6436346769332886, "rewards/length2tails_reward/std": 0.33519431948661804, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.0818464756011963, "rewards/thermo_reward/std": 2.5326437950134277, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.9375, "completions/mean_terminated_length": 270.9375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11991294380277395, "epoch": 0.798, "frac_reward_zero_std": 0.0, "grad_norm": 0.10349400341510773, "learning_rate": 1.902078627088692e-06, "loss": 0.002, "num_tokens": 3461644.0, "reward": 11.47077751159668, "reward_std": 5.410183429718018, "rewards/fitness_reward/mean": 6.689011573791504, "rewards/fitness_reward/std": 2.654714345932007, "rewards/kidney_reward/mean": 2.2360825538635254, "rewards/kidney_reward/std": 1.1814593076705933, "rewards/length2tails_reward/mean": 0.6174130439758301, "rewards/length2tails_reward/std": 0.3640681803226471, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.3839423656463623, "rewards/thermo_reward/std": 2.1441264152526855, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.09375, "completions/mean_terminated_length": 272.09375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.11858357861638069, "epoch": 0.8, "frac_reward_zero_std": 0.0, "grad_norm": 0.11466143280267715, "learning_rate": 1.9015244886608068e-06, "loss": -0.0015, "num_tokens": 3470383.0, "reward": 11.97624397277832, "reward_std": 3.287630081176758, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.224416971206665, "rewards/kidney_reward/std": 1.1031237840652466, "rewards/length2tails_reward/mean": 0.7445105314254761, "rewards/length2tails_reward/std": 0.2538454234600067, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.2736995220184326, "rewards/thermo_reward/std": 2.2119221687316895, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.34375, "completions/mean_terminated_length": 269.34375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09968282468616962, "epoch": 0.802, "frac_reward_zero_std": 0.0, "grad_norm": 0.17062100768089294, "learning_rate": 1.9009688679024189e-06, "loss": -0.0031, "num_tokens": 3479034.0, "reward": 10.030776977539062, "reward_std": 6.229125499725342, "rewards/fitness_reward/mean": 6.334397315979004, "rewards/fitness_reward/std": 3.067559003829956, "rewards/kidney_reward/mean": 1.9138509035110474, "rewards/kidney_reward/std": 1.6205497980117798, "rewards/length2tails_reward/mean": 0.5182558298110962, "rewards/length2tails_reward/std": 0.36070799827575684, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 1.6369529962539673, "rewards/thermo_reward/std": 2.5060691833496094, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.4375, "completions/mean_terminated_length": 270.4375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0993448942899704, "epoch": 0.804, "frac_reward_zero_std": 0.0, "grad_norm": 0.06334397196769714, "learning_rate": 1.9004117657271075e-06, "loss": -0.004, "num_tokens": 3487720.0, "reward": 12.601202964782715, "reward_std": 2.6943719387054443, "rewards/fitness_reward/mean": 7.011618614196777, "rewards/fitness_reward/std": 1.977447509765625, "rewards/kidney_reward/mean": 2.4896838665008545, "rewards/kidney_reward/std": 0.3229711949825287, "rewards/length2tails_reward/mean": 0.6432620882987976, "rewards/length2tails_reward/std": 0.30116140842437744, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9355742931365967, "rewards/thermo_reward/std": 1.4401322603225708, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 270.59375, "completions/mean_terminated_length": 270.59375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10525709856301546, "epoch": 0.806, "frac_reward_zero_std": 0.0, "grad_norm": 0.21384942531585693, "learning_rate": 1.8998531830508867e-06, "loss": -0.001, "num_tokens": 3496411.0, "reward": 11.023578643798828, "reward_std": 5.368097305297852, "rewards/fitness_reward/mean": 6.832054615020752, "rewards/fitness_reward/std": 2.111349105834961, "rewards/kidney_reward/mean": 2.1173434257507324, "rewards/kidney_reward/std": 1.3727282285690308, "rewards/length2tails_reward/mean": 0.6243403553962708, "rewards/length2tails_reward/std": 0.3703277111053467, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.911745548248291, "rewards/thermo_reward/std": 2.5675079822540283, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10209081880748272, "epoch": 0.808, "frac_reward_zero_std": 0.0, "grad_norm": 0.0930095762014389, "learning_rate": 1.8992931207922051e-06, "loss": -0.0076, "num_tokens": 3505127.0, "reward": 10.85621452331543, "reward_std": 5.562469482421875, "rewards/fitness_reward/mean": 6.39210319519043, "rewards/fitness_reward/std": 2.7849199771881104, "rewards/kidney_reward/mean": 1.9636497497558594, "rewards/kidney_reward/std": 1.3856197595596313, "rewards/length2tails_reward/mean": 0.6451936960220337, "rewards/length2tails_reward/std": 0.38377857208251953, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.335941791534424, "rewards/thermo_reward/std": 2.192247152328491, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.46875, "completions/mean_terminated_length": 271.46875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10685683414340019, "epoch": 0.81, "frac_reward_zero_std": 0.0, "grad_norm": 0.40007176995277405, "learning_rate": 1.898731579871945e-06, "loss": -0.0077, "num_tokens": 3513846.0, "reward": 11.469308853149414, "reward_std": 4.350868225097656, "rewards/fitness_reward/mean": 6.99554443359375, "rewards/fitness_reward/std": 1.7628074884414673, "rewards/kidney_reward/mean": 2.1276307106018066, "rewards/kidney_reward/std": 1.243864893913269, "rewards/length2tails_reward/mean": 0.6337643265724182, "rewards/length2tails_reward/std": 0.39094334840774536, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.1827573776245117, "rewards/thermo_reward/std": 2.2920548915863037, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.375, "completions/mean_terminated_length": 269.375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09846197813749313, "epoch": 0.812, "frac_reward_zero_std": 0.0, "grad_norm": 0.05986107513308525, "learning_rate": 1.898168561213419e-06, "loss": -0.0042, "num_tokens": 3522498.0, "reward": 11.358301162719727, "reward_std": 4.575360298156738, "rewards/fitness_reward/mean": 6.622168064117432, "rewards/fitness_reward/std": 2.699398994445801, "rewards/kidney_reward/mean": 2.2430858612060547, "rewards/kidney_reward/std": 0.880042552947998, "rewards/length2tails_reward/mean": 0.5045244097709656, "rewards/length2tails_reward/std": 0.38825201988220215, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.342595100402832, "rewards/thermo_reward/std": 2.359964370727539, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.46875, "completions/mean_terminated_length": 271.46875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10626904852688313, "epoch": 0.814, "frac_reward_zero_std": 0.0, "grad_norm": 0.049734681844711304, "learning_rate": 1.89760406574237e-06, "loss": -0.0031, "num_tokens": 3531217.0, "reward": 11.872875213623047, "reward_std": 5.345208644866943, "rewards/fitness_reward/mean": 6.691218376159668, "rewards/fitness_reward/std": 2.6453068256378174, "rewards/kidney_reward/mean": 2.2744810581207275, "rewards/kidney_reward/std": 1.1397809982299805, "rewards/length2tails_reward/mean": 0.7228773236274719, "rewards/length2tails_reward/std": 0.28869134187698364, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7348880767822266, "rewards/thermo_reward/std": 1.9927328824996948, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.3125, "completions/mean_terminated_length": 272.3125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09195891162380576, "epoch": 0.816, "frac_reward_zero_std": 0.0, "grad_norm": 0.42579376697540283, "learning_rate": 1.8970380943869686e-06, "loss": -0.0075, "num_tokens": 3539963.0, "reward": 11.169445037841797, "reward_std": 5.160699844360352, "rewards/fitness_reward/mean": 6.524928092956543, "rewards/fitness_reward/std": 2.845306873321533, "rewards/kidney_reward/mean": 2.268484115600586, "rewards/kidney_reward/std": 1.0355554819107056, "rewards/length2tails_reward/mean": 0.6663411855697632, "rewards/length2tails_reward/std": 0.3943096101284027, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.2093992233276367, "rewards/thermo_reward/std": 2.1658506393432617, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.28125, "completions/mean_terminated_length": 271.28125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10751304030418396, "epoch": 0.818, "frac_reward_zero_std": 0.0, "grad_norm": 0.07456594705581665, "learning_rate": 1.8964706480778127e-06, "loss": -0.0086, "num_tokens": 3548676.0, "reward": 11.711389541625977, "reward_std": 3.914869785308838, "rewards/fitness_reward/mean": 6.703976631164551, "rewards/fitness_reward/std": 2.5914342403411865, "rewards/kidney_reward/mean": 2.4296531677246094, "rewards/kidney_reward/std": 0.5578335523605347, "rewards/length2tails_reward/mean": 0.6096435785293579, "rewards/length2tails_reward/std": 0.3954150378704071, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4167962074279785, "rewards/thermo_reward/std": 1.7974330186843872, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.96875, "completions/mean_terminated_length": 269.96875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09460800653323531, "epoch": 0.82, "frac_reward_zero_std": 0.0, "grad_norm": 0.05850926786661148, "learning_rate": 1.8959017277479254e-06, "loss": -0.0032, "num_tokens": 3557347.0, "reward": 10.729347229003906, "reward_std": 6.073248386383057, "rewards/fitness_reward/mean": 5.974654674530029, "rewards/fitness_reward/std": 3.7373361587524414, "rewards/kidney_reward/mean": 2.17683482170105, "rewards/kidney_reward/std": 1.2341636419296265, "rewards/length2tails_reward/mean": 0.5170494318008423, "rewards/length2tails_reward/std": 0.4222230613231659, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4261536598205566, "rewards/thermo_reward/std": 2.2774081230163574, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.34375, "completions/mean_terminated_length": 270.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10826176684349775, "epoch": 0.822, "frac_reward_zero_std": 0.0, "grad_norm": 0.05722399801015854, "learning_rate": 1.8953313343327531e-06, "loss": -0.0024, "num_tokens": 3566030.0, "reward": 11.544689178466797, "reward_std": 5.336467266082764, "rewards/fitness_reward/mean": 6.698562145233154, "rewards/fitness_reward/std": 2.614190101623535, "rewards/kidney_reward/mean": 2.185883045196533, "rewards/kidney_reward/std": 1.2772369384765625, "rewards/length2tails_reward/mean": 0.6247825622558594, "rewards/length2tails_reward/std": 0.3542427718639374, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4977660179138184, "rewards/thermo_reward/std": 2.0525569915771484, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10833274759352207, "epoch": 0.824, "frac_reward_zero_std": 0.0, "grad_norm": 0.07414238899946213, "learning_rate": 1.8947594687701643e-06, "loss": -0.004, "num_tokens": 3574759.0, "reward": 12.215715408325195, "reward_std": 3.1908395290374756, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.303682327270508, "rewards/kidney_reward/std": 0.9398621916770935, "rewards/length2tails_reward/mean": 0.68268221616745, "rewards/length2tails_reward/std": 0.3193387985229492, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.440089225769043, "rewards/thermo_reward/std": 2.3476831912994385, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11643747333437204, "epoch": 0.826, "frac_reward_zero_std": 0.0, "grad_norm": 0.159814715385437, "learning_rate": 1.8941861320004482e-06, "loss": -0.0062, "num_tokens": 3583491.0, "reward": 12.556082725524902, "reward_std": 2.924164295196533, "rewards/fitness_reward/mean": 7.188657283782959, "rewards/fitness_reward/std": 0.7179933190345764, "rewards/kidney_reward/mean": 2.3408851623535156, "rewards/kidney_reward/std": 0.9379902482032776, "rewards/length2tails_reward/mean": 0.7260844707489014, "rewards/length2tails_reward/std": 0.3202032148838043, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.853931427001953, "rewards/thermo_reward/std": 1.5523806810379028, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11242737621068954, "epoch": 0.828, "frac_reward_zero_std": 0.0, "grad_norm": 0.06392483413219452, "learning_rate": 1.8936113249663134e-06, "loss": -0.0045, "num_tokens": 3592223.0, "reward": 12.918121337890625, "reward_std": 2.424635171890259, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4072585105895996, "rewards/kidney_reward/std": 0.6688947677612305, "rewards/length2tails_reward/mean": 0.7507193088531494, "rewards/length2tails_reward/std": 0.28658536076545715, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.974606513977051, "rewards/thermo_reward/std": 1.8415789604187012, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.78125, "completions/mean_terminated_length": 269.78125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10316573828458786, "epoch": 0.83, "frac_reward_zero_std": 0.0, "grad_norm": 0.07275240123271942, "learning_rate": 1.8930350486128855e-06, "loss": -0.0017, "num_tokens": 3600888.0, "reward": 12.80959701538086, "reward_std": 1.577688217163086, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.5662388205528259, "rewards/length2tails_reward/std": 0.3575478494167328, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7473864555358887, "rewards/thermo_reward/std": 1.4778509140014648, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.15625, "completions/mean_terminated_length": 271.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10222028940916061, "epoch": 0.832, "frac_reward_zero_std": 0.0, "grad_norm": 0.08466058224439621, "learning_rate": 1.8924573038877059e-06, "loss": -0.0026, "num_tokens": 3609597.0, "reward": 11.92800521850586, "reward_std": 4.447513103485107, "rewards/fitness_reward/mean": 6.99554443359375, "rewards/fitness_reward/std": 1.7628074884414673, "rewards/kidney_reward/mean": 2.132324695587158, "rewards/kidney_reward/std": 1.3145263195037842, "rewards/length2tails_reward/mean": 0.6122592687606812, "rewards/length2tails_reward/std": 0.39825892448425293, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6389098167419434, "rewards/thermo_reward/std": 2.201195478439331, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.3125, "completions/mean_terminated_length": 271.3125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11040330212563276, "epoch": 0.834, "frac_reward_zero_std": 0.0, "grad_norm": 0.0990830585360527, "learning_rate": 1.8918780917407306e-06, "loss": -0.0082, "num_tokens": 3618311.0, "reward": 11.632905960083008, "reward_std": 3.737865686416626, "rewards/fitness_reward/mean": 7.010110378265381, "rewards/fitness_reward/std": 1.9859774112701416, "rewards/kidney_reward/mean": 2.2957663536071777, "rewards/kidney_reward/std": 0.7961881756782532, "rewards/length2tails_reward/mean": 0.6458245515823364, "rewards/length2tails_reward/std": 0.3643723130226135, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.1624460220336914, "rewards/thermo_reward/std": 2.50215744972229, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.90625, "completions/mean_terminated_length": 271.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10497548617422581, "epoch": 0.836, "frac_reward_zero_std": 0.0, "grad_norm": 0.26866573095321655, "learning_rate": 1.891297413124329e-06, "loss": -0.0002, "num_tokens": 3627044.0, "reward": 11.439022064208984, "reward_std": 5.316377639770508, "rewards/fitness_reward/mean": 6.693381309509277, "rewards/fitness_reward/std": 2.6331963539123535, "rewards/kidney_reward/mean": 2.306674003601074, "rewards/kidney_reward/std": 1.155663251876831, "rewards/length2tails_reward/mean": 0.6525619029998779, "rewards/length2tails_reward/std": 0.3838775157928467, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.273709774017334, "rewards/thermo_reward/std": 1.9739047288894653, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10293256212025881, "epoch": 0.838, "frac_reward_zero_std": 0.0, "grad_norm": 0.45388221740722656, "learning_rate": 1.8907152689932807e-06, "loss": 0.0008, "num_tokens": 3635768.0, "reward": 11.114225387573242, "reward_std": 6.007777214050293, "rewards/fitness_reward/mean": 6.650113105773926, "rewards/fitness_reward/std": 2.7990036010742188, "rewards/kidney_reward/mean": 2.1136951446533203, "rewards/kidney_reward/std": 1.413481593132019, "rewards/length2tails_reward/mean": 0.7098208665847778, "rewards/length2tails_reward/std": 0.31065788865089417, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.1794347763061523, "rewards/thermo_reward/std": 2.332662343978882, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10914828535169363, "epoch": 0.84, "frac_reward_zero_std": 0.0, "grad_norm": 0.17354260385036469, "learning_rate": 1.890131660304776e-06, "loss": 0.0035, "num_tokens": 3644488.0, "reward": 12.097772598266602, "reward_std": 4.210140228271484, "rewards/fitness_reward/mean": 6.984278202056885, "rewards/fitness_reward/std": 1.8254834413528442, "rewards/kidney_reward/mean": 2.3367323875427246, "rewards/kidney_reward/std": 0.9451751112937927, "rewards/length2tails_reward/mean": 0.648028552532196, "rewards/length2tails_reward/std": 0.35677388310432434, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.611959934234619, "rewards/thermo_reward/std": 1.979071021080017, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.53125, "completions/mean_terminated_length": 270.53125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11126773897558451, "epoch": 0.842, "frac_reward_zero_std": 0.0, "grad_norm": 0.11031896620988846, "learning_rate": 1.8895465880184118e-06, "loss": -0.009, "num_tokens": 3653177.0, "reward": 11.771194458007812, "reward_std": 4.116332054138184, "rewards/fitness_reward/mean": 6.938035011291504, "rewards/fitness_reward/std": 1.78042471408844, "rewards/kidney_reward/mean": 2.236675262451172, "rewards/kidney_reward/std": 0.9763497710227966, "rewards/length2tails_reward/mean": 0.6223554611206055, "rewards/length2tails_reward/std": 0.3492722809314728, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4342474937438965, "rewards/thermo_reward/std": 2.1130752563476562, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.10812357719987631, "epoch": 0.844, "frac_reward_zero_std": 0.0, "grad_norm": 0.08260457962751389, "learning_rate": 1.8889600530961932e-06, "loss": -0.0046, "num_tokens": 3661957.0, "reward": 11.629140853881836, "reward_std": 5.307247161865234, "rewards/fitness_reward/mean": 6.631275177001953, "rewards/fitness_reward/std": 2.872642993927002, "rewards/kidney_reward/mean": 2.2064032554626465, "rewards/kidney_reward/std": 1.2581707239151, "rewards/length2tails_reward/mean": 0.8156726956367493, "rewards/length2tails_reward/std": 0.2643541395664215, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6098947525024414, "rewards/thermo_reward/std": 2.1621108055114746, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09654233139008284, "epoch": 0.846, "frac_reward_zero_std": 0.0, "grad_norm": 0.10825493186712265, "learning_rate": 1.8883720565025295e-06, "loss": -0.0025, "num_tokens": 3670633.0, "reward": 12.276029586791992, "reward_std": 3.112684965133667, "rewards/fitness_reward/mean": 7.188657283782959, "rewards/fitness_reward/std": 0.7179933190345764, "rewards/kidney_reward/mean": 2.2949612140655518, "rewards/kidney_reward/std": 0.9728565216064453, "rewards/length2tails_reward/mean": 0.5613535642623901, "rewards/length2tails_reward/std": 0.39826834201812744, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6362757682800293, "rewards/thermo_reward/std": 1.831773281097412, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.59375, "completions/mean_terminated_length": 269.59375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1035864856094122, "epoch": 0.848, "frac_reward_zero_std": 0.0, "grad_norm": 0.05640365555882454, "learning_rate": 1.8877825992042328e-06, "loss": -0.0017, "num_tokens": 3679292.0, "reward": 11.372427940368652, "reward_std": 5.877617359161377, "rewards/fitness_reward/mean": 6.392179489135742, "rewards/fitness_reward/std": 3.06811785697937, "rewards/kidney_reward/mean": 2.106853485107422, "rewards/kidney_reward/std": 1.2987583875656128, "rewards/length2tails_reward/mean": 0.5478678941726685, "rewards/length2tails_reward/std": 0.38686081767082214, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 2.724857807159424, "rewards/thermo_reward/std": 1.903943419456482, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 270.34375, "completions/mean_terminated_length": 270.34375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.11908046249300241, "epoch": 0.85, "frac_reward_zero_std": 0.0, "grad_norm": 0.3277480900287628, "learning_rate": 1.8871916821705183e-06, "loss": -0.02, "num_tokens": 3687975.0, "reward": 10.093684196472168, "reward_std": 7.310795783996582, "rewards/fitness_reward/mean": 6.2110700607299805, "rewards/fitness_reward/std": 3.644062042236328, "rewards/kidney_reward/mean": 1.734727144241333, "rewards/kidney_reward/std": 1.906247854232788, "rewards/length2tails_reward/mean": 0.6923362016677856, "rewards/length2tails_reward/std": 0.3460744321346283, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9786536693572998, "rewards/thermo_reward/std": 2.5642237663269043, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.78125, "completions/mean_terminated_length": 270.78125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10314168641343713, "epoch": 0.852, "frac_reward_zero_std": 0.0, "grad_norm": 0.08059486001729965, "learning_rate": 1.8865993063730002e-06, "loss": 0.0004, "num_tokens": 3696672.0, "reward": 9.494028091430664, "reward_std": 7.22643518447876, "rewards/fitness_reward/mean": 6.156418323516846, "rewards/fitness_reward/std": 3.2921860218048096, "rewards/kidney_reward/mean": 1.6028504371643066, "rewards/kidney_reward/std": 1.8587509393692017, "rewards/length2tails_reward/mean": 0.5948251485824585, "rewards/length2tails_reward/std": 0.38797110319137573, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.5752768516540527, "rewards/thermo_reward/std": 2.861532211303711, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.15625, "completions/mean_terminated_length": 272.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10947284288704395, "epoch": 0.854, "frac_reward_zero_std": 0.0, "grad_norm": 0.24476461112499237, "learning_rate": 1.8860054727856914e-06, "loss": -0.0029, "num_tokens": 3705413.0, "reward": 11.377592086791992, "reward_std": 5.323113441467285, "rewards/fitness_reward/mean": 6.946459770202637, "rewards/fitness_reward/std": 2.0363516807556152, "rewards/kidney_reward/mean": 2.0672290325164795, "rewards/kidney_reward/std": 1.5073118209838867, "rewards/length2tails_reward/mean": 0.755418598651886, "rewards/length2tails_reward/std": 0.28897804021835327, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.1883621215820312, "rewards/thermo_reward/std": 2.3532443046569824, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.1341212745755911, "epoch": 0.856, "frac_reward_zero_std": 0.0, "grad_norm": 0.10159477591514587, "learning_rate": 1.8854101823850024e-06, "loss": 0.0015, "num_tokens": 3714174.0, "reward": 11.991580963134766, "reward_std": 5.1352057456970215, "rewards/fitness_reward/mean": 7.010758399963379, "rewards/fitness_reward/std": 1.9823126792907715, "rewards/kidney_reward/mean": 2.2044568061828613, "rewards/kidney_reward/std": 1.312869668006897, "rewards/length2tails_reward/mean": 0.7862224578857422, "rewards/length2tails_reward/std": 0.2628347873687744, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.5977439880371094, "rewards/thermo_reward/std": 2.285738229751587, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11356043815612793, "epoch": 0.858, "frac_reward_zero_std": 0.0, "grad_norm": 0.09840331226587296, "learning_rate": 1.8848134361497382e-06, "loss": -0.0028, "num_tokens": 3722890.0, "reward": 11.219654083251953, "reward_std": 4.991817951202393, "rewards/fitness_reward/mean": 6.613970756530762, "rewards/fitness_reward/std": 2.7284114360809326, "rewards/kidney_reward/mean": 2.212526559829712, "rewards/kidney_reward/std": 1.1607743501663208, "rewards/length2tails_reward/mean": 0.6491613984107971, "rewards/length2tails_reward/std": 0.35882139205932617, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.2282400131225586, "rewards/thermo_reward/std": 2.2548983097076416, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 271.3125, "completions/mean_terminated_length": 271.3125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11092049349099398, "epoch": 0.86, "frac_reward_zero_std": 0.0, "grad_norm": 0.18348531424999237, "learning_rate": 1.884215235061099e-06, "loss": -0.006, "num_tokens": 3731604.0, "reward": 9.037184715270996, "reward_std": 6.7496137619018555, "rewards/fitness_reward/mean": 5.8980865478515625, "rewards/fitness_reward/std": 3.6536824703216553, "rewards/kidney_reward/mean": 1.7923259735107422, "rewards/kidney_reward/std": 1.5743216276168823, "rewards/length2tails_reward/mean": 0.6611948609352112, "rewards/length2tails_reward/std": 0.36291196942329407, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 1.1869029998779297, "rewards/thermo_reward/std": 2.8278138637542725, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10679224226623774, "epoch": 0.862, "frac_reward_zero_std": 0.0, "grad_norm": 0.059091147035360336, "learning_rate": 1.8836155801026753e-06, "loss": -0.0057, "num_tokens": 3740362.0, "reward": 10.398284912109375, "reward_std": 6.299036502838135, "rewards/fitness_reward/mean": 6.153982639312744, "rewards/fitness_reward/std": 3.443929433822632, "rewards/kidney_reward/mean": 1.9206444025039673, "rewards/kidney_reward/std": 1.4578499794006348, "rewards/length2tails_reward/mean": 0.7474456429481506, "rewards/length2tails_reward/std": 0.3091128468513489, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.148913860321045, "rewards/thermo_reward/std": 2.431601047515869, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11343036219477654, "epoch": 0.864, "frac_reward_zero_std": 0.0, "grad_norm": 0.05083763599395752, "learning_rate": 1.883014472260449e-06, "loss": 0.002, "num_tokens": 3749106.0, "reward": 12.751008987426758, "reward_std": 4.422357082366943, "rewards/fitness_reward/mean": 7.004498481750488, "rewards/fitness_reward/std": 2.01772403717041, "rewards/kidney_reward/mean": 2.3845763206481934, "rewards/kidney_reward/std": 1.0707193613052368, "rewards/length2tails_reward/mean": 0.7510284781455994, "rewards/length2tails_reward/std": 0.30357375741004944, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.186831474304199, "rewards/thermo_reward/std": 1.4339048862457275, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.375, "completions/mean_terminated_length": 269.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10790925193578005, "epoch": 0.866, "frac_reward_zero_std": 0.0, "grad_norm": 0.052672725170850754, "learning_rate": 1.8824119125227917e-06, "loss": -0.0052, "num_tokens": 3757758.0, "reward": 12.130245208740234, "reward_std": 3.5245840549468994, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.4843716621398926, "rewards/kidney_reward/std": 0.5299732089042664, "rewards/length2tails_reward/mean": 0.5352551937103271, "rewards/length2tails_reward/std": 0.34461134672164917, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4392945766448975, "rewards/thermo_reward/std": 2.1182706356048584, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11035188753157854, "epoch": 0.868, "frac_reward_zero_std": 0.0, "grad_norm": 0.10949714481830597, "learning_rate": 1.881807901880461e-06, "loss": 0.0003, "num_tokens": 3766479.0, "reward": 11.700475692749023, "reward_std": 4.995792865753174, "rewards/fitness_reward/mean": 6.950691223144531, "rewards/fitness_reward/std": 2.012726306915283, "rewards/kidney_reward/mean": 2.1191606521606445, "rewards/kidney_reward/std": 1.2957903146743774, "rewards/length2tails_reward/mean": 0.6825649738311768, "rewards/length2tails_reward/std": 0.33565008640289307, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4623684883117676, "rewards/thermo_reward/std": 2.338465929031372, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 270.375, "completions/mean_terminated_length": 270.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11510983109474182, "epoch": 0.87, "frac_reward_zero_std": 0.0, "grad_norm": 0.09583006799221039, "learning_rate": 1.8812024413266004e-06, "loss": -0.0006, "num_tokens": 3775163.0, "reward": 10.369340896606445, "reward_std": 6.81846809387207, "rewards/fitness_reward/mean": 6.297909736633301, "rewards/fitness_reward/std": 3.1902825832366943, "rewards/kidney_reward/mean": 1.9326624870300293, "rewards/kidney_reward/std": 1.5834283828735352, "rewards/length2tails_reward/mean": 0.6140122413635254, "rewards/length2tails_reward/std": 0.34732601046562195, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9773672819137573, "rewards/thermo_reward/std": 2.6822171211242676, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.12347112130373716, "epoch": 0.872, "frac_reward_zero_std": 0.0, "grad_norm": 0.08026017993688583, "learning_rate": 1.8805955318567379e-06, "loss": -0.0037, "num_tokens": 3783913.0, "reward": 11.330899238586426, "reward_std": 3.2819528579711914, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.019801616668701, "rewards/kidney_reward/std": 1.1881566047668457, "rewards/length2tails_reward/mean": 0.6870349049568176, "rewards/length2tails_reward/std": 0.3882122337818146, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.7812089920043945, "rewards/thermo_reward/std": 2.2675986289978027, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.03125, "completions/mean_terminated_length": 271.03125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10257836803793907, "epoch": 0.874, "frac_reward_zero_std": 0.0, "grad_norm": 0.06457014381885529, "learning_rate": 1.8799871744687837e-06, "loss": -0.0016, "num_tokens": 3792618.0, "reward": 11.247159957885742, "reward_std": 5.1798176765441895, "rewards/fitness_reward/mean": 6.952747344970703, "rewards/fitness_reward/std": 2.001246929168701, "rewards/kidney_reward/mean": 2.117042303085327, "rewards/kidney_reward/std": 1.360007405281067, "rewards/length2tails_reward/mean": 0.6416828632354736, "rewards/length2tails_reward/std": 0.35022541880607605, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.013201951980591, "rewards/thermo_reward/std": 2.656627655029297, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.0625, "completions/mean_terminated_length": 271.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10436887294054031, "epoch": 0.876, "frac_reward_zero_std": 0.0, "grad_norm": 0.07170294970273972, "learning_rate": 1.8793773701630285e-06, "loss": -0.0082, "num_tokens": 3801324.0, "reward": 11.48604965209961, "reward_std": 4.0768280029296875, "rewards/fitness_reward/mean": 6.872337341308594, "rewards/fitness_reward/std": 2.1419677734375, "rewards/kidney_reward/mean": 2.178575038909912, "rewards/kidney_reward/std": 0.9928116202354431, "rewards/length2tails_reward/mean": 0.6542539596557617, "rewards/length2tails_reward/std": 0.36290809512138367, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.269711494445801, "rewards/thermo_reward/std": 2.394803524017334, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.53125, "completions/mean_terminated_length": 270.53125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10206220205873251, "epoch": 0.878, "frac_reward_zero_std": 0.0, "grad_norm": 0.499766081571579, "learning_rate": 1.8787661199421426e-06, "loss": -0.0035, "num_tokens": 3810013.0, "reward": 10.417783737182617, "reward_std": 6.227560520172119, "rewards/fitness_reward/mean": 6.27559757232666, "rewards/fitness_reward/std": 3.4502835273742676, "rewards/kidney_reward/mean": 2.142082691192627, "rewards/kidney_reward/std": 1.3962165117263794, "rewards/length2tails_reward/mean": 0.6051323413848877, "rewards/length2tails_reward/std": 0.37149810791015625, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.8395893573760986, "rewards/thermo_reward/std": 2.511199474334717, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.8125, "completions/mean_terminated_length": 271.8125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10686526820063591, "epoch": 0.88, "frac_reward_zero_std": 0.0, "grad_norm": 0.06103542819619179, "learning_rate": 1.8781534248111729e-06, "loss": -0.0041, "num_tokens": 3818743.0, "reward": 12.375520706176758, "reward_std": 3.433697462081909, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.36761474609375, "rewards/kidney_reward/std": 0.7219849228858948, "rewards/length2tails_reward/mean": 0.7102646827697754, "rewards/length2tails_reward/std": 0.3061424493789673, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7838258743286133, "rewards/thermo_reward/std": 1.7129298448562622, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10807423712685704, "epoch": 0.882, "frac_reward_zero_std": 0.0, "grad_norm": 0.08479100465774536, "learning_rate": 1.877539285777543e-06, "loss": -0.0083, "num_tokens": 3827471.0, "reward": 11.815866470336914, "reward_std": 5.167867183685303, "rewards/fitness_reward/mean": 6.632122039794922, "rewards/fitness_reward/std": 2.65669584274292, "rewards/kidney_reward/mean": 2.2621984481811523, "rewards/kidney_reward/std": 1.1790883541107178, "rewards/length2tails_reward/mean": 0.6838822960853577, "rewards/length2tails_reward/std": 0.3464617431163788, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7531585693359375, "rewards/thermo_reward/std": 1.9235625267028809, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10133147519081831, "epoch": 0.884, "frac_reward_zero_std": 0.0, "grad_norm": 0.07655313611030579, "learning_rate": 1.8769237038510499e-06, "loss": -0.0091, "num_tokens": 3836187.0, "reward": 12.801630020141602, "reward_std": 2.2215499877929688, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.464538097381592, "rewards/kidney_reward/std": 0.6341493725776672, "rewards/length2tails_reward/mean": 0.6683378219604492, "rewards/length2tails_reward/std": 0.3719833195209503, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.809072971343994, "rewards/thermo_reward/std": 1.7491624355316162, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.65625, "completions/mean_terminated_length": 271.65625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11100908927619457, "epoch": 0.886, "frac_reward_zero_std": 0.0, "grad_norm": 0.06725804507732391, "learning_rate": 1.8763066800438634e-06, "loss": -0.0025, "num_tokens": 3844912.0, "reward": 11.207784652709961, "reward_std": 5.085540294647217, "rewards/fitness_reward/mean": 6.943241119384766, "rewards/fitness_reward/std": 2.0543272495269775, "rewards/kidney_reward/mean": 2.169588804244995, "rewards/kidney_reward/std": 1.2773592472076416, "rewards/length2tails_reward/mean": 0.6841601133346558, "rewards/length2tails_reward/std": 0.33557799458503723, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9265384674072266, "rewards/thermo_reward/std": 2.5070431232452393, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.21875, "completions/mean_terminated_length": 271.21875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10447764955461025, "epoch": 0.888, "frac_reward_zero_std": 0.0, "grad_norm": 0.06931206583976746, "learning_rate": 1.8756882153705246e-06, "loss": -0.0056, "num_tokens": 3853623.0, "reward": 12.738012313842773, "reward_std": 2.0027472972869873, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4896838665008545, "rewards/kidney_reward/std": 0.3229711949825287, "rewards/length2tails_reward/mean": 0.6159868836402893, "rewards/length2tails_reward/std": 0.3990537226200104, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7255449295043945, "rewards/thermo_reward/std": 1.750189185142517, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.9375, "completions/mean_terminated_length": 270.9375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10720862448215485, "epoch": 0.89, "frac_reward_zero_std": 0.0, "grad_norm": 0.0804607942700386, "learning_rate": 1.875068310847943e-06, "loss": -0.0008, "num_tokens": 3862325.0, "reward": 12.044584274291992, "reward_std": 4.262816905975342, "rewards/fitness_reward/mean": 6.969001293182373, "rewards/fitness_reward/std": 1.910581111907959, "rewards/kidney_reward/mean": 2.3637166023254395, "rewards/kidney_reward/std": 0.8406797647476196, "rewards/length2tails_reward/mean": 0.7061450481414795, "rewards/length2tails_reward/std": 0.27908092737197876, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.5412521362304688, "rewards/thermo_reward/std": 2.0376970767974854, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09836816508322954, "epoch": 0.892, "frac_reward_zero_std": 0.0, "grad_norm": 0.07819455862045288, "learning_rate": 1.8744469674953955e-06, "loss": -0.0041, "num_tokens": 3871001.0, "reward": 9.899242401123047, "reward_std": 6.037757873535156, "rewards/fitness_reward/mean": 6.267643928527832, "rewards/fitness_reward/std": 3.281162738800049, "rewards/kidney_reward/mean": 1.7312264442443848, "rewards/kidney_reward/std": 1.5176829099655151, "rewards/length2tails_reward/mean": 0.5764222145080566, "rewards/length2tails_reward/std": 0.3620404303073883, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.7427294254302979, "rewards/thermo_reward/std": 2.74582576751709, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.15625, "completions/mean_terminated_length": 271.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09943460021167994, "epoch": 0.894, "frac_reward_zero_std": 0.0, "grad_norm": 0.15047623217105865, "learning_rate": 1.873824186334526e-06, "loss": -0.0073, "num_tokens": 3879710.0, "reward": 11.746528625488281, "reward_std": 4.381689071655273, "rewards/fitness_reward/mean": 7.131148338317871, "rewards/fitness_reward/std": 0.905185341835022, "rewards/kidney_reward/mean": 2.1221461296081543, "rewards/kidney_reward/std": 1.507962703704834, "rewards/length2tails_reward/mean": 0.6461377143859863, "rewards/length2tails_reward/std": 0.3861064612865448, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.3286209106445312, "rewards/thermo_reward/std": 2.35248064994812, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11172295361757278, "epoch": 0.896, "frac_reward_zero_std": 0.0, "grad_norm": 0.06506041437387466, "learning_rate": 1.8731999683893402e-06, "loss": -0.0047, "num_tokens": 3888445.0, "reward": 12.793821334838867, "reward_std": 3.550959348678589, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.3735175132751465, "rewards/kidney_reward/std": 0.917577862739563, "rewards/length2tails_reward/mean": 0.7215828895568848, "rewards/length2tails_reward/std": 0.27914682030677795, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.19509220123291, "rewards/thermo_reward/std": 1.4694974422454834, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.8125, "completions/mean_terminated_length": 270.8125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1053133811801672, "epoch": 0.898, "frac_reward_zero_std": 0.0, "grad_norm": 0.08816291391849518, "learning_rate": 1.8725743146862092e-06, "loss": 0.0001, "num_tokens": 3897143.0, "reward": 11.757261276245117, "reward_std": 5.4524030685424805, "rewards/fitness_reward/mean": 6.697388172149658, "rewards/fitness_reward/std": 2.6181633472442627, "rewards/kidney_reward/mean": 2.312431812286377, "rewards/kidney_reward/std": 1.1747411489486694, "rewards/length2tails_reward/mean": 0.6832621097564697, "rewards/length2tails_reward/std": 0.32060670852661133, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.5791144371032715, "rewards/thermo_reward/std": 2.3686468601226807, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10911222640424967, "epoch": 0.9, "frac_reward_zero_std": 0.0, "grad_norm": 0.053479693830013275, "learning_rate": 1.8719472262538622e-06, "loss": 0.0004, "num_tokens": 3905872.0, "reward": 12.228736877441406, "reward_std": 4.497715950012207, "rewards/fitness_reward/mean": 7.005762577056885, "rewards/fitness_reward/std": 2.010572910308838, "rewards/kidney_reward/mean": 2.346081495285034, "rewards/kidney_reward/std": 0.9915664196014404, "rewards/length2tails_reward/mean": 0.6802853941917419, "rewards/length2tails_reward/std": 0.34215471148490906, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.708864688873291, "rewards/thermo_reward/std": 2.0118680000305176, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 265.15625, "completions/mean_terminated_length": 265.15625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.10868939571082592, "epoch": 0.902, "frac_reward_zero_std": 0.0, "grad_norm": 0.2735053300857544, "learning_rate": 1.8713187041233893e-06, "loss": -0.0532, "num_tokens": 3914389.0, "reward": 9.818794250488281, "reward_std": 7.534875392913818, "rewards/fitness_reward/mean": 6.295016765594482, "rewards/fitness_reward/std": 3.373291492462158, "rewards/kidney_reward/mean": 1.7130271196365356, "rewards/kidney_reward/std": 1.9210028648376465, "rewards/length2tails_reward/mean": 0.6132369041442871, "rewards/length2tails_reward/std": 0.3320391774177551, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.6494271755218506, "rewards/thermo_reward/std": 3.0261270999908447, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11821938399225473, "epoch": 0.904, "frac_reward_zero_std": 0.0, "grad_norm": 0.1313985288143158, "learning_rate": 1.8706887493282366e-06, "loss": 0.004, "num_tokens": 3923093.0, "reward": 10.177400588989258, "reward_std": 7.999275207519531, "rewards/fitness_reward/mean": 6.244592666625977, "rewards/fitness_reward/std": 3.542152166366577, "rewards/kidney_reward/mean": 1.8024885654449463, "rewards/kidney_reward/std": 2.027366876602173, "rewards/length2tails_reward/mean": 0.6615628600120544, "rewards/length2tails_reward/std": 0.3624088764190674, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9641624689102173, "rewards/thermo_reward/std": 2.8660004138946533, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.71875, "completions/mean_terminated_length": 270.71875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10027664620429277, "epoch": 0.906, "frac_reward_zero_std": 0.0, "grad_norm": 0.07662325352430344, "learning_rate": 1.870057362904207e-06, "loss": -0.0087, "num_tokens": 3931788.0, "reward": 11.619586944580078, "reward_std": 3.6709110736846924, "rewards/fitness_reward/mean": 6.9872660636901855, "rewards/fitness_reward/std": 2.1152055263519287, "rewards/kidney_reward/mean": 2.3769898414611816, "rewards/kidney_reward/std": 0.6920126080513, "rewards/length2tails_reward/mean": 0.5850013494491577, "rewards/length2tails_reward/std": 0.3896300792694092, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.0968310832977295, "rewards/thermo_reward/std": 2.492100477218628, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.5625, "completions/mean_terminated_length": 269.5625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.1027941033244133, "epoch": 0.908, "frac_reward_zero_std": 0.0, "grad_norm": 0.073139987885952, "learning_rate": 1.8694245458894566e-06, "loss": 0.0003, "num_tokens": 3940446.0, "reward": 10.70131778717041, "reward_std": 5.772634506225586, "rewards/fitness_reward/mean": 6.5864458084106445, "rewards/fitness_reward/std": 2.652723550796509, "rewards/kidney_reward/mean": 2.006622552871704, "rewards/kidney_reward/std": 1.4546256065368652, "rewards/length2tails_reward/mean": 0.5887947082519531, "rewards/length2tails_reward/std": 0.41531050205230713, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9493696689605713, "rewards/thermo_reward/std": 2.536822557449341, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.12032953463494778, "epoch": 0.91, "frac_reward_zero_std": 0.0, "grad_norm": 0.10334499925374985, "learning_rate": 1.8687902993244942e-06, "loss": -0.0033, "num_tokens": 3949167.0, "reward": 10.728096008300781, "reward_std": 6.511604309082031, "rewards/fitness_reward/mean": 6.469539642333984, "rewards/fitness_reward/std": 2.875175952911377, "rewards/kidney_reward/mean": 2.025391101837158, "rewards/kidney_reward/std": 1.6085196733474731, "rewards/length2tails_reward/mean": 0.6913998126983643, "rewards/length2tails_reward/std": 0.308725506067276, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.0640251636505127, "rewards/thermo_reward/std": 2.5514793395996094, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.15625, "completions/mean_terminated_length": 271.15625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11203371826559305, "epoch": 0.912, "frac_reward_zero_std": 0.0, "grad_norm": 0.09344188123941422, "learning_rate": 1.8681546242521783e-06, "loss": 0.0019, "num_tokens": 3957876.0, "reward": 9.808435440063477, "reward_std": 6.910869598388672, "rewards/fitness_reward/mean": 6.19857120513916, "rewards/fitness_reward/std": 3.487494945526123, "rewards/kidney_reward/mean": 1.8561313152313232, "rewards/kidney_reward/std": 1.8377562761306763, "rewards/length2tails_reward/mean": 0.6736408472061157, "rewards/length2tails_reward/std": 0.3606426417827606, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.5863685607910156, "rewards/thermo_reward/std": 2.6774237155914307, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1099884919822216, "epoch": 0.914, "frac_reward_zero_std": 0.0, "grad_norm": 0.1273649036884308, "learning_rate": 1.8675175217177175e-06, "loss": -0.0006, "num_tokens": 3966611.0, "reward": 10.460234642028809, "reward_std": 6.559492111206055, "rewards/fitness_reward/mean": 6.5706024169921875, "rewards/fitness_reward/std": 2.889678716659546, "rewards/kidney_reward/mean": 1.9447031021118164, "rewards/kidney_reward/std": 1.5729787349700928, "rewards/length2tails_reward/mean": 0.7160842418670654, "rewards/length2tails_reward/std": 0.3384973108768463, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.7733216285705566, "rewards/thermo_reward/std": 2.7137043476104736, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.875, "completions/mean_terminated_length": 269.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10272444318979979, "epoch": 0.916, "frac_reward_zero_std": 0.0, "grad_norm": 0.060355983674526215, "learning_rate": 1.8668789927686666e-06, "loss": -0.0059, "num_tokens": 3975279.0, "reward": 11.540103912353516, "reward_std": 3.569697618484497, "rewards/fitness_reward/mean": 6.937399864196777, "rewards/fitness_reward/std": 1.8441245555877686, "rewards/kidney_reward/mean": 2.296140432357788, "rewards/kidney_reward/std": 0.7964839935302734, "rewards/length2tails_reward/mean": 0.5815209746360779, "rewards/length2tails_reward/std": 0.32663068175315857, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.148411273956299, "rewards/thermo_reward/std": 2.218916654586792, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 272.15625, "completions/mean_terminated_length": 272.15625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10958459321409464, "epoch": 0.918, "frac_reward_zero_std": 0.0, "grad_norm": 0.0612795390188694, "learning_rate": 1.866239038454926e-06, "loss": -0.0016, "num_tokens": 3984020.0, "reward": 12.074222564697266, "reward_std": 4.837175369262695, "rewards/fitness_reward/mean": 7.007224082946777, "rewards/fitness_reward/std": 2.002307653427124, "rewards/kidney_reward/mean": 2.2945916652679443, "rewards/kidney_reward/std": 1.1536694765090942, "rewards/length2tails_reward/mean": 0.7487285733222961, "rewards/length2tails_reward/std": 0.2750574052333832, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.5975348949432373, "rewards/thermo_reward/std": 2.3417248725891113, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09708154760301113, "epoch": 0.92, "frac_reward_zero_std": 0.0, "grad_norm": 0.14464029669761658, "learning_rate": 1.8655976598287392e-06, "loss": -0.0104, "num_tokens": 3992684.0, "reward": 11.042967796325684, "reward_std": 5.450899124145508, "rewards/fitness_reward/mean": 6.508005142211914, "rewards/fitness_reward/std": 2.7292606830596924, "rewards/kidney_reward/mean": 2.1265482902526855, "rewards/kidney_reward/std": 1.3590309619903564, "rewards/length2tails_reward/mean": 0.5516175627708435, "rewards/length2tails_reward/std": 0.41006430983543396, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.2532529830932617, "rewards/thermo_reward/std": 2.283200740814209, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 270.40625, "completions/mean_terminated_length": 270.40625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10907396487891674, "epoch": 0.922, "frac_reward_zero_std": 0.0, "grad_norm": 0.07392656803131104, "learning_rate": 1.8649548579446935e-06, "loss": -0.0046, "num_tokens": 4001369.0, "reward": 11.953383445739746, "reward_std": 3.702449321746826, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.452592670917511, "rewards/kidney_reward/mean": 2.1486942768096924, "rewards/kidney_reward/std": 1.3303357362747192, "rewards/length2tails_reward/mean": 0.6268943548202515, "rewards/length2tails_reward/std": 0.36906784772872925, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.3958330154418945, "rewards/thermo_reward/std": 2.2485578060150146, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.4375, "completions/mean_terminated_length": 271.4375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10220075491815805, "epoch": 0.924, "frac_reward_zero_std": 0.0, "grad_norm": 0.07153013348579407, "learning_rate": 1.864310633859714e-06, "loss": -0.0018, "num_tokens": 4010087.0, "reward": 11.94033145904541, "reward_std": 4.323536396026611, "rewards/fitness_reward/mean": 6.880142688751221, "rewards/fitness_reward/std": 1.8578017950057983, "rewards/kidney_reward/mean": 2.155754804611206, "rewards/kidney_reward/std": 0.9834696054458618, "rewards/length2tails_reward/mean": 0.6571721434593201, "rewards/length2tails_reward/std": 0.33760565519332886, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7387170791625977, "rewards/thermo_reward/std": 2.0518715381622314, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10953015740960836, "epoch": 0.926, "frac_reward_zero_std": 0.0, "grad_norm": 0.08538804203271866, "learning_rate": 1.863664988633066e-06, "loss": -0.0035, "num_tokens": 4018857.0, "reward": 10.794061660766602, "reward_std": 5.991631507873535, "rewards/fitness_reward/mean": 6.6039018630981445, "rewards/fitness_reward/std": 2.7577898502349854, "rewards/kidney_reward/mean": 2.080371856689453, "rewards/kidney_reward/std": 1.446594476699829, "rewards/length2tails_reward/mean": 0.7289041876792908, "rewards/length2tails_reward/std": 0.3512548506259918, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9368979930877686, "rewards/thermo_reward/std": 2.4265477657318115, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 270.9375, "completions/mean_terminated_length": 270.9375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11466868966817856, "epoch": 0.928, "frac_reward_zero_std": 0.0, "grad_norm": 0.05370505526661873, "learning_rate": 1.86301792332635e-06, "loss": 0.0004, "num_tokens": 4027559.0, "reward": 12.159685134887695, "reward_std": 4.429521560668945, "rewards/fitness_reward/mean": 6.945412635803223, "rewards/fitness_reward/std": 2.042198896408081, "rewards/kidney_reward/mean": 2.3425214290618896, "rewards/kidney_reward/std": 1.0108206272125244, "rewards/length2tails_reward/mean": 0.6667732000350952, "rewards/length2tails_reward/std": 0.3295728862285614, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7050743103027344, "rewards/thermo_reward/std": 1.7441182136535645, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10749402642250061, "epoch": 0.93, "frac_reward_zero_std": 0.0, "grad_norm": 0.06493767350912094, "learning_rate": 1.8623694390035035e-06, "loss": -0.0022, "num_tokens": 4036287.0, "reward": 11.909021377563477, "reward_std": 4.883798599243164, "rewards/fitness_reward/mean": 7.002554893493652, "rewards/fitness_reward/std": 2.0287187099456787, "rewards/kidney_reward/mean": 2.2224979400634766, "rewards/kidney_reward/std": 1.2641041278839111, "rewards/length2tails_reward/mean": 0.6873390674591064, "rewards/length2tails_reward/std": 0.3392084836959839, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.51523494720459, "rewards/thermo_reward/std": 2.2844624519348145, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.111768938601017, "epoch": 0.932, "frac_reward_zero_std": 0.0, "grad_norm": 0.14316429197788239, "learning_rate": 1.8617195367307949e-06, "loss": -0.0057, "num_tokens": 4045037.0, "reward": 12.788331985473633, "reward_std": 3.252232551574707, "rewards/fitness_reward/mean": 6.863459587097168, "rewards/fitness_reward/std": 2.2403881549835205, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7575904130935669, "rewards/length2tails_reward/std": 0.31032228469848633, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.204712152481079, "rewards/thermo_reward/std": 0.9641280770301819, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.3125, "completions/mean_terminated_length": 271.3125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11796944122761488, "epoch": 0.934, "frac_reward_zero_std": 0.0, "grad_norm": 0.09694632142782211, "learning_rate": 1.8610682175768257e-06, "loss": -0.0057, "num_tokens": 4053751.0, "reward": 12.549504280090332, "reward_std": 2.293038845062256, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.499530553817749, "rewards/kidney_reward/std": 0.5794037580490112, "rewards/length2tails_reward/mean": 0.7065576314926147, "rewards/length2tails_reward/std": 0.29977843165397644, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.5756421089172363, "rewards/thermo_reward/std": 1.6526124477386475, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11116079892963171, "epoch": 0.936, "frac_reward_zero_std": 0.0, "grad_norm": 0.09160441905260086, "learning_rate": 1.8604154826125268e-06, "loss": -0.004, "num_tokens": 4062521.0, "reward": 11.810023307800293, "reward_std": 5.807432651519775, "rewards/fitness_reward/mean": 6.940349102020264, "rewards/fitness_reward/std": 2.380608320236206, "rewards/kidney_reward/mean": 2.253354072570801, "rewards/kidney_reward/std": 1.6274490356445312, "rewards/length2tails_reward/mean": 0.7608587741851807, "rewards/length2tails_reward/std": 0.31576141715049744, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4402339458465576, "rewards/thermo_reward/std": 2.2192366123199463, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.9375, "completions/mean_terminated_length": 273.9375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09761633444577456, "epoch": 0.938, "frac_reward_zero_std": 0.0, "grad_norm": 0.05027596279978752, "learning_rate": 1.8597613329111566e-06, "loss": -0.0023, "num_tokens": 4071319.0, "reward": 11.971752166748047, "reward_std": 4.021405220031738, "rewards/fitness_reward/mean": 6.9682512283325195, "rewards/fitness_reward/std": 1.9147640466690063, "rewards/kidney_reward/mean": 2.3168246746063232, "rewards/kidney_reward/std": 0.8652597069740295, "rewards/length2tails_reward/mean": 0.7702042460441589, "rewards/length2tails_reward/std": 0.32408878207206726, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.509655475616455, "rewards/thermo_reward/std": 1.7250123023986816, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 266.15625, "completions/mean_terminated_length": 266.15625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.10231568291783333, "epoch": 0.94, "frac_reward_zero_std": 0.0, "grad_norm": 0.22313550114631653, "learning_rate": 1.8591057695483e-06, "loss": -0.0678, "num_tokens": 4079868.0, "reward": 9.78188705444336, "reward_std": 8.069836616516113, "rewards/fitness_reward/mean": 5.98593807220459, "rewards/fitness_reward/std": 3.7097551822662354, "rewards/kidney_reward/mean": 1.5625232458114624, "rewards/kidney_reward/std": 2.1295108795166016, "rewards/length2tails_reward/mean": 0.7279834747314453, "rewards/length2tails_reward/std": 0.35069504380226135, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.0606276988983154, "rewards/thermo_reward/std": 2.832650899887085, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.0625, "completions/mean_terminated_length": 271.0625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09819444455206394, "epoch": 0.942, "frac_reward_zero_std": 0.0, "grad_norm": 0.07939817011356354, "learning_rate": 1.858448793601866e-06, "loss": -0.0048, "num_tokens": 4088574.0, "reward": 12.200660705566406, "reward_std": 3.8725593090057373, "rewards/fitness_reward/mean": 6.886867046356201, "rewards/fitness_reward/std": 2.0616986751556396, "rewards/kidney_reward/mean": 2.4161300659179688, "rewards/kidney_reward/std": 0.6241863369941711, "rewards/length2tails_reward/mean": 0.6536673903465271, "rewards/length2tails_reward/std": 0.3719845116138458, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.732297420501709, "rewards/thermo_reward/std": 2.0418496131896973, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11007297784090042, "epoch": 0.944, "frac_reward_zero_std": 0.0, "grad_norm": 0.06128634884953499, "learning_rate": 1.8577904061520866e-06, "loss": -0.0081, "num_tokens": 4097278.0, "reward": 12.243531227111816, "reward_std": 3.5722525119781494, "rewards/fitness_reward/mean": 6.99554443359375, "rewards/fitness_reward/std": 1.7628074884414673, "rewards/kidney_reward/mean": 2.3417837619781494, "rewards/kidney_reward/std": 0.7228731513023376, "rewards/length2tails_reward/mean": 0.6581995487213135, "rewards/length2tails_reward/std": 0.339339941740036, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7403831481933594, "rewards/thermo_reward/std": 1.9604158401489258, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.84375, "completions/mean_terminated_length": 269.84375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09462772868573666, "epoch": 0.946, "frac_reward_zero_std": 0.0, "grad_norm": 0.09483262896537781, "learning_rate": 1.8571306082815148e-06, "loss": -0.0077, "num_tokens": 4105945.0, "reward": 10.221860885620117, "reward_std": 4.868222713470459, "rewards/fitness_reward/mean": 6.629333972930908, "rewards/fitness_reward/std": 2.439204692840576, "rewards/kidney_reward/mean": 2.050110340118408, "rewards/kidney_reward/std": 1.2794262170791626, "rewards/length2tails_reward/mean": 0.554818868637085, "rewards/length2tails_reward/std": 0.39995256066322327, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.3869357109069824, "rewards/thermo_reward/std": 2.6503026485443115, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.4375, "completions/mean_terminated_length": 270.4375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10915269795805216, "epoch": 0.948, "frac_reward_zero_std": 0.0, "grad_norm": 0.1461164355278015, "learning_rate": 1.8564694010750221e-06, "loss": 0.0002, "num_tokens": 4114631.0, "reward": 10.527725219726562, "reward_std": 6.852277755737305, "rewards/fitness_reward/mean": 6.522241592407227, "rewards/fitness_reward/std": 2.871361255645752, "rewards/kidney_reward/mean": 1.8803939819335938, "rewards/kidney_reward/std": 1.8421525955200195, "rewards/length2tails_reward/mean": 0.5725621581077576, "rewards/length2tails_reward/std": 0.3809102475643158, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.967833161354065, "rewards/thermo_reward/std": 2.5932414531707764, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10617079585790634, "epoch": 0.95, "frac_reward_zero_std": 0.0, "grad_norm": 0.16131462156772614, "learning_rate": 1.8558067856197975e-06, "loss": -0.0002, "num_tokens": 4123335.0, "reward": 11.356301307678223, "reward_std": 4.680373668670654, "rewards/fitness_reward/mean": 7.0524444580078125, "rewards/fitness_reward/std": 1.746500015258789, "rewards/kidney_reward/mean": 2.194664478302002, "rewards/kidney_reward/std": 1.2206449508666992, "rewards/length2tails_reward/mean": 0.6265060901641846, "rewards/length2tails_reward/std": 0.3382713198661804, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9465417861938477, "rewards/thermo_reward/std": 2.584482431411743, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.0, "completions/mean_terminated_length": 270.0, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10205241851508617, "epoch": 0.952, "frac_reward_zero_std": 0.0, "grad_norm": 0.1406603455543518, "learning_rate": 1.8551427630053463e-06, "loss": -0.0054, "num_tokens": 4132007.0, "reward": 12.318782806396484, "reward_std": 2.5967447757720947, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3722710609436035, "rewards/kidney_reward/std": 0.8007427453994751, "rewards/length2tails_reward/mean": 0.6015236973762512, "rewards/length2tails_reward/std": 0.35340017080307007, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4826831817626953, "rewards/thermo_reward/std": 1.8363468647003174, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.90625, "completions/mean_terminated_length": 270.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10897734574973583, "epoch": 0.954, "frac_reward_zero_std": 0.0, "grad_norm": 0.07693850249052048, "learning_rate": 1.8544773343234858e-06, "loss": -0.0035, "num_tokens": 4140708.0, "reward": 11.850772857666016, "reward_std": 4.4592084884643555, "rewards/fitness_reward/mean": 6.938035011291504, "rewards/fitness_reward/std": 1.840762972831726, "rewards/kidney_reward/mean": 2.3154757022857666, "rewards/kidney_reward/std": 1.0696473121643066, "rewards/length2tails_reward/mean": 0.626863956451416, "rewards/length2tails_reward/std": 0.38042908906936646, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4345757961273193, "rewards/thermo_reward/std": 2.333374261856079, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.4375, "completions/mean_terminated_length": 270.4375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10131760127842426, "epoch": 0.956, "frac_reward_zero_std": 0.0, "grad_norm": 0.07668693363666534, "learning_rate": 1.853810500668347e-06, "loss": -0.0044, "num_tokens": 4149394.0, "reward": 12.450238227844238, "reward_std": 2.4274275302886963, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.423727035522461, "rewards/kidney_reward/std": 0.5866073369979858, "rewards/length2tails_reward/mean": 0.6329134106636047, "rewards/length2tails_reward/std": 0.32143205404281616, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.502035617828369, "rewards/thermo_reward/std": 2.041130542755127, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10201771184802055, "epoch": 0.958, "frac_reward_zero_std": 0.0, "grad_norm": 0.08321689069271088, "learning_rate": 1.8531422631363704e-06, "loss": -0.003, "num_tokens": 4158110.0, "reward": 12.717397689819336, "reward_std": 2.4608662128448486, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4727680683135986, "rewards/kidney_reward/std": 0.7264254689216614, "rewards/length2tails_reward/mean": 0.6925871968269348, "rewards/length2tails_reward/std": 0.31042250990867615, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7141854763031006, "rewards/thermo_reward/std": 1.9494096040725708, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.625, "completions/mean_terminated_length": 270.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1034108754247427, "epoch": 0.96, "frac_reward_zero_std": 0.0, "grad_norm": 0.05778651311993599, "learning_rate": 1.8524726228263044e-06, "loss": -0.0086, "num_tokens": 4166802.0, "reward": 11.463024139404297, "reward_std": 4.595519065856934, "rewards/fitness_reward/mean": 6.678974151611328, "rewards/fitness_reward/std": 2.4648430347442627, "rewards/kidney_reward/mean": 2.2850759029388428, "rewards/kidney_reward/std": 1.023033618927002, "rewards/length2tails_reward/mean": 0.5975102782249451, "rewards/length2tails_reward/std": 0.3732609152793884, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.339221954345703, "rewards/thermo_reward/std": 2.1146011352539062, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 269.59375, "completions/mean_terminated_length": 269.59375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.11340959277004004, "epoch": 0.962, "frac_reward_zero_std": 0.0, "grad_norm": 0.6065387725830078, "learning_rate": 1.8518015808392043e-06, "loss": -0.0232, "num_tokens": 4175461.0, "reward": 10.408863067626953, "reward_std": 7.76609992980957, "rewards/fitness_reward/mean": 6.237054824829102, "rewards/fitness_reward/std": 3.5606303215026855, "rewards/kidney_reward/mean": 1.810880184173584, "rewards/kidney_reward/std": 1.9630531072616577, "rewards/length2tails_reward/mean": 0.7209354639053345, "rewards/length2tails_reward/std": 0.31308096647262573, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.1888349056243896, "rewards/thermo_reward/std": 2.5315160751342773, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "entropy": 0.10557975806295872, "epoch": 0.964, "frac_reward_zero_std": 0.0, "grad_norm": 0.5803879499435425, "learning_rate": 1.8511291382784297e-06, "loss": -0.0099, "num_tokens": 4184198.0, "reward": 11.030176162719727, "reward_std": 6.811028003692627, "rewards/fitness_reward/mean": 6.34883975982666, "rewards/fitness_reward/std": 3.2230067253112793, "rewards/kidney_reward/mean": 2.0069994926452637, "rewards/kidney_reward/std": 1.6315367221832275, "rewards/length2tails_reward/mean": 0.7537314891815186, "rewards/length2tails_reward/std": 0.3153182864189148, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4989633560180664, "rewards/thermo_reward/std": 2.3130509853363037, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.71875, "completions/mean_terminated_length": 269.71875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09722129534929991, "epoch": 0.966, "frac_reward_zero_std": 0.0, "grad_norm": 0.09880959242582321, "learning_rate": 1.850455296249644e-06, "loss": -0.0078, "num_tokens": 4192861.0, "reward": 10.829471588134766, "reward_std": 4.066839218139648, "rewards/fitness_reward/mean": 6.840295791625977, "rewards/fitness_reward/std": 2.014033079147339, "rewards/kidney_reward/mean": 2.231708526611328, "rewards/kidney_reward/std": 0.9333588480949402, "rewards/length2tails_reward/mean": 0.514818012714386, "rewards/length2tails_reward/std": 0.397733211517334, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.6059857606887817, "rewards/thermo_reward/std": 2.648850202560425, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10204268898814917, "epoch": 0.968, "frac_reward_zero_std": 0.0, "grad_norm": 0.09644491225481033, "learning_rate": 1.849780055860811e-06, "loss": -0.0005, "num_tokens": 4201606.0, "reward": 12.977973937988281, "reward_std": 1.523888349533081, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.517043113708496, "rewards/kidney_reward/std": 0.2941751182079315, "rewards/length2tails_reward/mean": 0.7408137917518616, "rewards/length2tails_reward/std": 0.29664987325668335, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9256644248962402, "rewards/thermo_reward/std": 1.3327876329421997, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.6875, "completions/mean_terminated_length": 269.6875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10017648991197348, "epoch": 0.97, "frac_reward_zero_std": 0.0, "grad_norm": 0.12505213916301727, "learning_rate": 1.8491034182221936e-06, "loss": -0.0048, "num_tokens": 4210268.0, "reward": 11.93092155456543, "reward_std": 3.9217958450317383, "rewards/fitness_reward/mean": 6.938035011291504, "rewards/fitness_reward/std": 1.78042471408844, "rewards/kidney_reward/mean": 2.246797800064087, "rewards/kidney_reward/std": 0.8911625742912292, "rewards/length2tails_reward/mean": 0.5875948667526245, "rewards/length2tails_reward/std": 0.3603399097919464, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.5873284339904785, "rewards/thermo_reward/std": 1.9247843027114868, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.46875, "completions/mean_terminated_length": 270.46875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.0971584739163518, "epoch": 0.972, "frac_reward_zero_std": 0.0, "grad_norm": 0.11438634991645813, "learning_rate": 1.8484253844463524e-06, "loss": -0.0076, "num_tokens": 4218955.0, "reward": 12.453248977661133, "reward_std": 2.544297218322754, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3667173385620117, "rewards/kidney_reward/std": 0.725242018699646, "rewards/length2tails_reward/mean": 0.5997471213340759, "rewards/length2tails_reward/std": 0.3877236247062683, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.622880458831787, "rewards/thermo_reward/std": 1.9670342206954956, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10207152832299471, "epoch": 0.974, "frac_reward_zero_std": 0.0, "grad_norm": 0.08598785102367401, "learning_rate": 1.8477459556481437e-06, "loss": -0.004, "num_tokens": 4227716.0, "reward": 11.039971351623535, "reward_std": 5.43131160736084, "rewards/fitness_reward/mean": 6.3909101486206055, "rewards/fitness_reward/std": 3.0708882808685303, "rewards/kidney_reward/mean": 2.193580150604248, "rewards/kidney_reward/std": 1.1655123233795166, "rewards/length2tails_reward/mean": 0.7655168771743774, "rewards/length2tails_reward/std": 0.3056708872318268, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.2789292335510254, "rewards/thermo_reward/std": 2.220484495162964, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.1875, "completions/mean_terminated_length": 271.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10653836652636528, "epoch": 0.976, "frac_reward_zero_std": 0.0, "grad_norm": 0.07108797878026962, "learning_rate": 1.8470651329447175e-06, "loss": -0.006, "num_tokens": 4236426.0, "reward": 12.28726577758789, "reward_std": 2.68487548828125, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.476879596710205, "rewards/kidney_reward/std": 0.5689665675163269, "rewards/length2tails_reward/mean": 0.6496505737304688, "rewards/length2tails_reward/std": 0.39126691222190857, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.284236431121826, "rewards/thermo_reward/std": 2.293436288833618, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.5, "completions/mean_terminated_length": 270.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09677772503346205, "epoch": 0.978, "frac_reward_zero_std": 0.0, "grad_norm": 0.05292100831866264, "learning_rate": 1.8463829174555157e-06, "loss": -0.0025, "num_tokens": 4245114.0, "reward": 11.512476921081543, "reward_std": 6.075210094451904, "rewards/fitness_reward/mean": 6.321190357208252, "rewards/fitness_reward/std": 3.2969279289245605, "rewards/kidney_reward/mean": 2.188544750213623, "rewards/kidney_reward/std": 1.2047877311706543, "rewards/length2tails_reward/mean": 0.6134219169616699, "rewards/length2tails_reward/std": 0.3425354063510895, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8413994312286377, "rewards/thermo_reward/std": 1.940303087234497, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.875, "completions/mean_terminated_length": 270.875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10276855435222387, "epoch": 0.98, "frac_reward_zero_std": 0.0, "grad_norm": 0.08010396361351013, "learning_rate": 1.8456993103022703e-06, "loss": -0.0096, "num_tokens": 4253814.0, "reward": 10.646703720092773, "reward_std": 5.514558792114258, "rewards/fitness_reward/mean": 6.77551794052124, "rewards/fitness_reward/std": 2.1163885593414307, "rewards/kidney_reward/mean": 1.8354774713516235, "rewards/kidney_reward/std": 1.6018898487091064, "rewards/length2tails_reward/mean": 0.6580426692962646, "rewards/length2tails_reward/std": 0.3390595018863678, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.8699034452438354, "rewards/thermo_reward/std": 2.7576637268066406, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.09375, "completions/mean_terminated_length": 272.09375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10569905303418636, "epoch": 0.982, "frac_reward_zero_std": 0.0, "grad_norm": 0.06571035832166672, "learning_rate": 1.8450143126090012e-06, "loss": -0.005, "num_tokens": 4262553.0, "reward": 13.003958702087402, "reward_std": 2.756281852722168, "rewards/fitness_reward/mean": 6.987574577331543, "rewards/fitness_reward/std": 2.1134605407714844, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7179726362228394, "rewards/length2tails_reward/std": 0.3528580963611603, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2728257179260254, "rewards/thermo_reward/std": 1.1995137929916382, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10283433459699154, "epoch": 0.984, "frac_reward_zero_std": 0.0, "grad_norm": 0.08230716735124588, "learning_rate": 1.844327925502015e-06, "loss": -0.0026, "num_tokens": 4271289.0, "reward": 10.768950462341309, "reward_std": 6.651513576507568, "rewards/fitness_reward/mean": 6.357293128967285, "rewards/fitness_reward/std": 3.1766951084136963, "rewards/kidney_reward/mean": 2.013303756713867, "rewards/kidney_reward/std": 1.531698226928711, "rewards/length2tails_reward/mean": 0.6933887004852295, "rewards/length2tails_reward/std": 0.32781982421875, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.2290148735046387, "rewards/thermo_reward/std": 2.4779889583587646, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.90625, "completions/mean_terminated_length": 271.90625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10609956178814173, "epoch": 0.986, "frac_reward_zero_std": 0.0, "grad_norm": 0.12279146909713745, "learning_rate": 1.8436401501099033e-06, "loss": 0.0054, "num_tokens": 4280022.0, "reward": 11.089923858642578, "reward_std": 5.652596473693848, "rewards/fitness_reward/mean": 6.703958034515381, "rewards/fitness_reward/std": 2.588440179824829, "rewards/kidney_reward/mean": 2.064419746398926, "rewards/kidney_reward/std": 1.4529000520706177, "rewards/length2tails_reward/mean": 0.7086784839630127, "rewards/length2tails_reward/std": 0.3152945637702942, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.1506786346435547, "rewards/thermo_reward/std": 2.2029199600219727, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11765718180686235, "epoch": 0.988, "frac_reward_zero_std": 0.0, "grad_norm": 0.08861474692821503, "learning_rate": 1.8429509875635394e-06, "loss": 0.0, "num_tokens": 4288772.0, "reward": 12.592763900756836, "reward_std": 2.527524471282959, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.450157403945923, "rewards/kidney_reward/std": 0.5792059302330017, "rewards/length2tails_reward/mean": 0.7417012453079224, "rewards/length2tails_reward/std": 0.3148317039012909, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.664761781692505, "rewards/thermo_reward/std": 1.8582063913345337, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.65625, "completions/mean_terminated_length": 271.65625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10666430741548538, "epoch": 0.99, "frac_reward_zero_std": 0.0, "grad_norm": 0.06550094485282898, "learning_rate": 1.8422604389960781e-06, "loss": -0.0037, "num_tokens": 4297497.0, "reward": 13.099773406982422, "reward_std": 1.8023239374160767, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7044112682342529, "rewards/length2tails_reward/std": 0.34925153851509094, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.996385097503662, "rewards/thermo_reward/std": 1.6214781999588013, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.96875, "completions/mean_terminated_length": 272.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11526863928884268, "epoch": 0.992, "frac_reward_zero_std": 0.0, "grad_norm": 0.06436719745397568, "learning_rate": 1.841568505542953e-06, "loss": 0.0007, "num_tokens": 4306264.0, "reward": 12.25389575958252, "reward_std": 4.86123514175415, "rewards/fitness_reward/mean": 6.996882438659668, "rewards/fitness_reward/std": 2.060805082321167, "rewards/kidney_reward/mean": 2.236731767654419, "rewards/kidney_reward/std": 1.2397674322128296, "rewards/length2tails_reward/mean": 0.796875, "rewards/length2tails_reward/std": 0.2885507345199585, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8405933380126953, "rewards/thermo_reward/std": 2.0179951190948486, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11093102768063545, "epoch": 0.994, "frac_reward_zero_std": 0.0, "grad_norm": 0.10350531339645386, "learning_rate": 1.8408751883418752e-06, "loss": -0.0026, "num_tokens": 4314972.0, "reward": 11.615776062011719, "reward_std": 5.574860572814941, "rewards/fitness_reward/mean": 6.668831825256348, "rewards/fitness_reward/std": 2.5053114891052246, "rewards/kidney_reward/mean": 2.082124710083008, "rewards/kidney_reward/std": 1.3706881999969482, "rewards/length2tails_reward/mean": 0.645334005355835, "rewards/length2tails_reward/std": 0.36578190326690674, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.700286865234375, "rewards/thermo_reward/std": 2.2157039642333984, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1063738688826561, "epoch": 0.996, "frac_reward_zero_std": 0.0, "grad_norm": 0.07675768435001373, "learning_rate": 1.84018048853283e-06, "loss": -0.003, "num_tokens": 4323707.0, "reward": 12.359034538269043, "reward_std": 3.6783368587493896, "rewards/fitness_reward/mean": 6.99554443359375, "rewards/fitness_reward/std": 1.7628074884414673, "rewards/kidney_reward/mean": 2.3726367950439453, "rewards/kidney_reward/std": 0.804735004901886, "rewards/length2tails_reward/mean": 0.6753664612770081, "rewards/length2tails_reward/std": 0.3424879014492035, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.823317050933838, "rewards/thermo_reward/std": 1.7088568210601807, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.96875, "completions/mean_terminated_length": 270.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09414213243871927, "epoch": 0.998, "frac_reward_zero_std": 0.0, "grad_norm": 0.08835254609584808, "learning_rate": 1.8394844072580772e-06, "loss": -0.0066, "num_tokens": 4332410.0, "reward": 12.498444557189941, "reward_std": 2.584066152572632, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.452592670917511, "rewards/kidney_reward/mean": 2.3080384731292725, "rewards/kidney_reward/std": 1.0372236967086792, "rewards/length2tails_reward/mean": 0.6141963601112366, "rewards/length2tails_reward/std": 0.36699378490448, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.782820463180542, "rewards/thermo_reward/std": 1.6219178438186646, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.28125, "completions/mean_terminated_length": 271.28125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11134423781186342, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.054818034172058105, "learning_rate": 1.8387869456621473e-06, "loss": -0.0017, "num_tokens": 4341123.0, "reward": 11.53868293762207, "reward_std": 5.201396465301514, "rewards/fitness_reward/mean": 6.6967315673828125, "rewards/fitness_reward/std": 2.6219189167022705, "rewards/kidney_reward/mean": 2.3333699703216553, "rewards/kidney_reward/std": 1.1131701469421387, "rewards/length2tails_reward/mean": 0.70151686668396, "rewards/length2tails_reward/std": 0.3320710062980652, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.3384299278259277, "rewards/thermo_reward/std": 2.0443801879882812, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.1253562057390809, "epoch": 1.002, "frac_reward_zero_std": 0.0, "grad_norm": 0.06054362282156944, "learning_rate": 1.8380881048918404e-06, "loss": 0.002, "num_tokens": 4349874.0, "reward": 12.315496444702148, "reward_std": 4.542181968688965, "rewards/fitness_reward/mean": 7.001006126403809, "rewards/fitness_reward/std": 2.0374791622161865, "rewards/kidney_reward/mean": 2.3622498512268066, "rewards/kidney_reward/std": 1.04790198802948, "rewards/length2tails_reward/mean": 0.7722654342651367, "rewards/length2tails_reward/std": 0.2462671995162964, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.775014877319336, "rewards/thermo_reward/std": 1.8799045085906982, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11229314375668764, "epoch": 1.004, "frac_reward_zero_std": 0.0, "grad_norm": 0.24925032258033752, "learning_rate": 1.8373878860962253e-06, "loss": 0.0003, "num_tokens": 4358586.0, "reward": 11.532876968383789, "reward_std": 5.967067241668701, "rewards/fitness_reward/mean": 6.932981967926025, "rewards/fitness_reward/std": 2.4222817420959473, "rewards/kidney_reward/mean": 2.1588492393493652, "rewards/kidney_reward/std": 1.5926848649978638, "rewards/length2tails_reward/mean": 0.6957947015762329, "rewards/length2tails_reward/std": 0.29977282881736755, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.271465301513672, "rewards/thermo_reward/std": 2.5201990604400635, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0931809302419424, "epoch": 1.006, "frac_reward_zero_std": 0.0, "grad_norm": 0.06151973456144333, "learning_rate": 1.8366862904266354e-06, "loss": -0.0089, "num_tokens": 4367323.0, "reward": 11.453720092773438, "reward_std": 4.2606096267700195, "rewards/fitness_reward/mean": 6.628477096557617, "rewards/fitness_reward/std": 2.486849546432495, "rewards/kidney_reward/mean": 2.3373258113861084, "rewards/kidney_reward/std": 0.7392154932022095, "rewards/length2tails_reward/mean": 0.6723840236663818, "rewards/length2tails_reward/std": 0.3951359689235687, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.3206787109375, "rewards/thermo_reward/std": 1.9403377771377563, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.90625, "completions/mean_terminated_length": 271.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10857211658731103, "epoch": 1.008, "frac_reward_zero_std": 0.0, "grad_norm": 0.08904221653938293, "learning_rate": 1.8359833190366683e-06, "loss": -0.0083, "num_tokens": 4376056.0, "reward": 11.623477935791016, "reward_std": 4.135257244110107, "rewards/fitness_reward/mean": 6.969882488250732, "rewards/fitness_reward/std": 1.9056702852249146, "rewards/kidney_reward/mean": 2.1147756576538086, "rewards/kidney_reward/std": 1.0822776556015015, "rewards/length2tails_reward/mean": 0.7080065011978149, "rewards/length2tails_reward/std": 0.3449402153491974, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.368018627166748, "rewards/thermo_reward/std": 2.2309768199920654, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1062892684713006, "epoch": 1.01, "frac_reward_zero_std": 0.0, "grad_norm": 0.058957964181900024, "learning_rate": 1.835278973082184e-06, "loss": -0.0043, "num_tokens": 4384788.0, "reward": 11.644981384277344, "reward_std": 5.438553333282471, "rewards/fitness_reward/mean": 6.697482585906982, "rewards/fitness_reward/std": 2.618746280670166, "rewards/kidney_reward/mean": 2.15616774559021, "rewards/kidney_reward/std": 1.2606943845748901, "rewards/length2tails_reward/mean": 0.7042187452316284, "rewards/length2tails_reward/std": 0.3460327088832855, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6209092140197754, "rewards/thermo_reward/std": 2.147388458251953, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 270.1875, "completions/mean_terminated_length": 270.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09534286428242922, "epoch": 1.012, "frac_reward_zero_std": 0.0, "grad_norm": 0.07244093716144562, "learning_rate": 1.8345732537213026e-06, "loss": -0.0038, "num_tokens": 4393466.0, "reward": 10.906755447387695, "reward_std": 5.130542278289795, "rewards/fitness_reward/mean": 6.565263748168945, "rewards/fitness_reward/std": 2.7279858589172363, "rewards/kidney_reward/mean": 2.2643046379089355, "rewards/kidney_reward/std": 1.112157940864563, "rewards/length2tails_reward/mean": 0.5872292518615723, "rewards/length2tails_reward/std": 0.38594943284988403, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9184648990631104, "rewards/thermo_reward/std": 2.4295599460601807, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.71875, "completions/mean_terminated_length": 271.71875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11011694930493832, "epoch": 1.014, "frac_reward_zero_std": 0.0, "grad_norm": 0.10459110885858536, "learning_rate": 1.8338661621144022e-06, "loss": -0.0045, "num_tokens": 4402193.0, "reward": 11.295709609985352, "reward_std": 6.177306652069092, "rewards/fitness_reward/mean": 6.025014877319336, "rewards/fitness_reward/std": 3.602267026901245, "rewards/kidney_reward/mean": 2.210982084274292, "rewards/kidney_reward/std": 1.167697787284851, "rewards/length2tails_reward/mean": 0.696526288986206, "rewards/length2tails_reward/std": 0.3571210205554962, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.890059471130371, "rewards/thermo_reward/std": 1.7832286357879639, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.09375, "completions/mean_terminated_length": 271.09375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09178171120584011, "epoch": 1.016, "frac_reward_zero_std": 0.0, "grad_norm": 0.0839139074087143, "learning_rate": 1.833157699424117e-06, "loss": -0.0069, "num_tokens": 4410900.0, "reward": 12.223529815673828, "reward_std": 3.18105149269104, "rewards/fitness_reward/mean": 7.0122480392456055, "rewards/fitness_reward/std": 1.973885416984558, "rewards/kidney_reward/mean": 2.4003708362579346, "rewards/kidney_reward/std": 0.5782254934310913, "rewards/length2tails_reward/mean": 0.65845787525177, "rewards/length2tails_reward/std": 0.329045832157135, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.645064353942871, "rewards/thermo_reward/std": 1.9347671270370483, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.10174894332885742, "epoch": 1.018, "frac_reward_zero_std": 0.0, "grad_norm": 0.10819156467914581, "learning_rate": 1.8324478668153366e-06, "loss": -0.0057, "num_tokens": 4419616.0, "reward": 11.47518539428711, "reward_std": 5.487979888916016, "rewards/fitness_reward/mean": 6.920076370239258, "rewards/fitness_reward/std": 2.183814287185669, "rewards/kidney_reward/mean": 2.144624948501587, "rewards/kidney_reward/std": 1.366068959236145, "rewards/length2tails_reward/mean": 0.6623252630233765, "rewards/length2tails_reward/std": 0.3512051999568939, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.244251012802124, "rewards/thermo_reward/std": 2.4937503337860107, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 267.28125, "completions/mean_terminated_length": 267.28125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.09971592016518116, "epoch": 1.02, "frac_reward_zero_std": 0.0, "grad_norm": 0.2520746886730194, "learning_rate": 1.8317366654552013e-06, "loss": -0.0522, "num_tokens": 4428201.0, "reward": 10.527538299560547, "reward_std": 6.803934574127197, "rewards/fitness_reward/mean": 6.230249881744385, "rewards/fitness_reward/std": 3.576293468475342, "rewards/kidney_reward/mean": 2.0987014770507812, "rewards/kidney_reward/std": 1.649312973022461, "rewards/length2tails_reward/mean": 0.6444635391235352, "rewards/length2tails_reward/std": 0.3857713043689728, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.0341413021087646, "rewards/thermo_reward/std": 2.3133511543273926, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1145568871870637, "epoch": 1.022, "frac_reward_zero_std": 0.0, "grad_norm": 0.04647472873330116, "learning_rate": 1.8310240965131038e-06, "loss": -0.0035, "num_tokens": 4436922.0, "reward": 11.225116729736328, "reward_std": 5.405675411224365, "rewards/fitness_reward/mean": 6.641835689544678, "rewards/fitness_reward/std": 2.6161036491394043, "rewards/kidney_reward/mean": 2.1738617420196533, "rewards/kidney_reward/std": 1.2172448635101318, "rewards/length2tails_reward/mean": 0.7327454686164856, "rewards/length2tails_reward/std": 0.29265257716178894, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.2361443042755127, "rewards/thermo_reward/std": 2.1350820064544678, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10694421455264091, "epoch": 1.024, "frac_reward_zero_std": 0.0, "grad_norm": 0.09988442808389664, "learning_rate": 1.8303101611606847e-06, "loss": -0.0026, "num_tokens": 4445618.0, "reward": 11.589323043823242, "reward_std": 3.4726717472076416, "rewards/fitness_reward/mean": 6.948733806610107, "rewards/fitness_reward/std": 2.023653745651245, "rewards/kidney_reward/mean": 2.2359910011291504, "rewards/kidney_reward/std": 0.8204216957092285, "rewards/length2tails_reward/mean": 0.6413706541061401, "rewards/length2tails_reward/std": 0.3659186363220215, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.2404606342315674, "rewards/thermo_reward/std": 2.057912826538086, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.0958978058770299, "epoch": 1.026, "frac_reward_zero_std": 0.0, "grad_norm": 0.1153431311249733, "learning_rate": 1.8295948605718311e-06, "loss": -0.0033, "num_tokens": 4454282.0, "reward": 11.745344161987305, "reward_std": 5.108170986175537, "rewards/fitness_reward/mean": 6.678919792175293, "rewards/fitness_reward/std": 2.696547746658325, "rewards/kidney_reward/mean": 2.1953866481781006, "rewards/kidney_reward/std": 1.3030292987823486, "rewards/length2tails_reward/mean": 0.7217558026313782, "rewards/length2tails_reward/std": 0.306129515171051, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.698862075805664, "rewards/thermo_reward/std": 2.0775516033172607, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.6875, "completions/mean_terminated_length": 270.6875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10792504902929068, "epoch": 1.028, "frac_reward_zero_std": 0.0, "grad_norm": 0.1147734671831131, "learning_rate": 1.828878195922675e-06, "loss": -0.0057, "num_tokens": 4462976.0, "reward": 11.455698013305664, "reward_std": 3.909986734390259, "rewards/fitness_reward/mean": 6.936039924621582, "rewards/fitness_reward/std": 1.791343092918396, "rewards/kidney_reward/mean": 2.275510549545288, "rewards/kidney_reward/std": 1.045230746269226, "rewards/length2tails_reward/mean": 0.6162968277931213, "rewards/length2tails_reward/std": 0.3768027722835541, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.0825178623199463, "rewards/thermo_reward/std": 2.351339340209961, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09783510770648718, "epoch": 1.03, "frac_reward_zero_std": 0.0, "grad_norm": 0.08594634383916855, "learning_rate": 1.8281601683915914e-06, "loss": -0.0072, "num_tokens": 4471692.0, "reward": 11.942680358886719, "reward_std": 4.303277015686035, "rewards/fitness_reward/mean": 7.026032447814941, "rewards/fitness_reward/std": 1.8959089517593384, "rewards/kidney_reward/mean": 2.2736551761627197, "rewards/kidney_reward/std": 0.9514713883399963, "rewards/length2tails_reward/mean": 0.6495813131332397, "rewards/length2tails_reward/std": 0.35913750529289246, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.478034496307373, "rewards/thermo_reward/std": 2.3042123317718506, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.34375, "completions/mean_terminated_length": 271.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10969353280961514, "epoch": 1.032, "frac_reward_zero_std": 0.0, "grad_norm": 0.06955910474061966, "learning_rate": 1.8274407791591963e-06, "loss": -0.0085, "num_tokens": 4480407.0, "reward": 12.465177536010742, "reward_std": 3.026686429977417, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.452592670917511, "rewards/kidney_reward/mean": 2.355340003967285, "rewards/kidney_reward/std": 0.8772143721580505, "rewards/length2tails_reward/mean": 0.7324072122573853, "rewards/length2tails_reward/std": 0.2933201789855957, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.690430164337158, "rewards/thermo_reward/std": 1.988502025604248, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10960424691438675, "epoch": 1.034, "frac_reward_zero_std": 0.0, "grad_norm": 0.09094872325658798, "learning_rate": 1.8267200294083446e-06, "loss": -0.0049, "num_tokens": 4489171.0, "reward": 12.051908493041992, "reward_std": 3.2191474437713623, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.3650550842285156, "rewards/kidney_reward/std": 0.7524277567863464, "rewards/length2tails_reward/mean": 0.7414542436599731, "rewards/length2tails_reward/std": 0.29311615228652954, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.2665421962738037, "rewards/thermo_reward/std": 2.147674322128296, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.71875, "completions/mean_terminated_length": 271.71875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09909907821565866, "epoch": 1.036, "frac_reward_zero_std": 0.0, "grad_norm": 0.07928860932588577, "learning_rate": 1.8259979203241278e-06, "loss": -0.0026, "num_tokens": 4497898.0, "reward": 12.741703033447266, "reward_std": 2.7505831718444824, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.452592670917511, "rewards/kidney_reward/mean": 2.4167726039886475, "rewards/kidney_reward/std": 0.7528964281082153, "rewards/length2tails_reward/mean": 0.6687544584274292, "rewards/length2tails_reward/std": 0.3758578896522522, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9118881225585938, "rewards/thermo_reward/std": 1.768688440322876, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.71875, "completions/mean_terminated_length": 269.71875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08322983188554645, "epoch": 1.038, "frac_reward_zero_std": 0.0, "grad_norm": 0.14451418817043304, "learning_rate": 1.8252744530938734e-06, "loss": -0.0065, "num_tokens": 4506561.0, "reward": 11.721282958984375, "reward_std": 4.508687973022461, "rewards/fitness_reward/mean": 6.937592506408691, "rewards/fitness_reward/std": 1.7828460931777954, "rewards/kidney_reward/mean": 2.0562925338745117, "rewards/kidney_reward/std": 1.4569692611694336, "rewards/length2tails_reward/mean": 0.5203403234481812, "rewards/length2tails_reward/std": 0.38675656914711, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.575364112854004, "rewards/thermo_reward/std": 2.1809372901916504, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.65625, "completions/mean_terminated_length": 270.65625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10837945714592934, "epoch": 1.04, "frac_reward_zero_std": 0.0, "grad_norm": 0.07109640538692474, "learning_rate": 1.8245496289071406e-06, "loss": -0.0024, "num_tokens": 4515254.0, "reward": 11.002652168273926, "reward_std": 6.390777111053467, "rewards/fitness_reward/mean": 6.532375812530518, "rewards/fitness_reward/std": 2.853579044342041, "rewards/kidney_reward/mean": 2.061156988143921, "rewards/kidney_reward/std": 1.530630111694336, "rewards/length2tails_reward/mean": 0.6790802478790283, "rewards/length2tails_reward/std": 0.2736893594264984, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.2412116527557373, "rewards/thermo_reward/std": 2.4915144443511963, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11088752839714289, "epoch": 1.042, "frac_reward_zero_std": 0.0, "grad_norm": 0.12135175615549088, "learning_rate": 1.8238234489557214e-06, "loss": -0.0017, "num_tokens": 4524001.0, "reward": 12.2737398147583, "reward_std": 3.4874472618103027, "rewards/fitness_reward/mean": 7.131148338317871, "rewards/fitness_reward/std": 0.905185341835022, "rewards/kidney_reward/mean": 2.2964911460876465, "rewards/kidney_reward/std": 1.0294334888458252, "rewards/length2tails_reward/mean": 0.7129369974136353, "rewards/length2tails_reward/std": 0.3501805067062378, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.674807071685791, "rewards/thermo_reward/std": 2.00650691986084, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10240734275430441, "epoch": 1.044, "frac_reward_zero_std": 0.0, "grad_norm": 0.08196910470724106, "learning_rate": 1.8230959144336361e-06, "loss": -0.0046, "num_tokens": 4532697.0, "reward": 11.078343391418457, "reward_std": 3.847487688064575, "rewards/fitness_reward/mean": 7.131148338317871, "rewards/fitness_reward/std": 0.6183593273162842, "rewards/kidney_reward/mean": 1.8429230451583862, "rewards/kidney_reward/std": 1.2928483486175537, "rewards/length2tails_reward/mean": 0.5630030632019043, "rewards/length2tails_reward/std": 0.3994128406047821, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.947972059249878, "rewards/thermo_reward/std": 2.4602742195129395, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.09375, "completions/mean_terminated_length": 271.09375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10700604878365993, "epoch": 1.046, "frac_reward_zero_std": 0.0, "grad_norm": 0.5880448818206787, "learning_rate": 1.8223670265371328e-06, "loss": -0.0068, "num_tokens": 4541404.0, "reward": 11.866726875305176, "reward_std": 5.31985330581665, "rewards/fitness_reward/mean": 6.681757926940918, "rewards/fitness_reward/std": 2.685328483581543, "rewards/kidney_reward/mean": 2.2746706008911133, "rewards/kidney_reward/std": 1.2622480392456055, "rewards/length2tails_reward/mean": 0.6555532217025757, "rewards/length2tails_reward/std": 0.3372742831707001, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7447428703308105, "rewards/thermo_reward/std": 2.023343563079834, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.3125, "completions/mean_terminated_length": 273.3125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1021508201956749, "epoch": 1.048, "frac_reward_zero_std": 0.0, "grad_norm": 0.07593434303998947, "learning_rate": 1.8216367864646836e-06, "loss": -0.0024, "num_tokens": 4550182.0, "reward": 10.609359741210938, "reward_std": 6.6359124183654785, "rewards/fitness_reward/mean": 6.591131210327148, "rewards/fitness_reward/std": 2.8076703548431396, "rewards/kidney_reward/mean": 1.8723961114883423, "rewards/kidney_reward/std": 1.6987696886062622, "rewards/length2tails_reward/mean": 0.7949404716491699, "rewards/length2tails_reward/std": 0.2707257866859436, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9663383960723877, "rewards/thermo_reward/std": 2.863776206970215, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.71875, "completions/mean_terminated_length": 271.71875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09915499482303858, "epoch": 1.05, "frac_reward_zero_std": 0.0, "grad_norm": 0.14689137041568756, "learning_rate": 1.8209051954169853e-06, "loss": 0.0008, "num_tokens": 4558909.0, "reward": 11.72974681854248, "reward_std": 5.813907146453857, "rewards/fitness_reward/mean": 6.727794170379639, "rewards/fitness_reward/std": 2.492391586303711, "rewards/kidney_reward/mean": 2.1069676876068115, "rewards/kidney_reward/std": 1.3520479202270508, "rewards/length2tails_reward/mean": 0.6884158253669739, "rewards/length2tails_reward/std": 0.3286331295967102, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7261428833007812, "rewards/thermo_reward/std": 2.316819906234741, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.34375, "completions/mean_terminated_length": 269.34375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.10628684982657433, "epoch": 1.052, "frac_reward_zero_std": 0.0, "grad_norm": 0.06357128173112869, "learning_rate": 1.8201722545969557e-06, "loss": -0.0012, "num_tokens": 4567560.0, "reward": 12.3041353225708, "reward_std": 3.4844396114349365, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.4296531677246094, "rewards/kidney_reward/std": 0.5578335523605347, "rewards/length2tails_reward/mean": 0.6693383455276489, "rewards/length2tails_reward/std": 0.30575159192085266, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.654494285583496, "rewards/thermo_reward/std": 1.891273856163025, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10698442067950964, "epoch": 1.054, "frac_reward_zero_std": 0.0, "grad_norm": 0.1364608258008957, "learning_rate": 1.8194379652097318e-06, "loss": 0.0019, "num_tokens": 4576272.0, "reward": 11.943489074707031, "reward_std": 3.0079591274261475, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3153510093688965, "rewards/kidney_reward/std": 0.8190693855285645, "rewards/length2tails_reward/mean": 0.7093468308448792, "rewards/length2tails_reward/std": 0.28690895438194275, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.1535279750823975, "rewards/thermo_reward/std": 2.2502543926239014, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.11601835023611784, "epoch": 1.056, "frac_reward_zero_std": 0.0, "grad_norm": 0.23851068317890167, "learning_rate": 1.8187023284626676e-06, "loss": 0.0002, "num_tokens": 4585036.0, "reward": 13.2262544631958, "reward_std": 1.4058492183685303, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4896838665008545, "rewards/kidney_reward/std": 0.3229711949825287, "rewards/length2tails_reward/mean": 0.7839996218681335, "rewards/length2tails_reward/std": 0.2844403088092804, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1969854831695557, "rewards/thermo_reward/std": 1.1832919120788574, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.96875, "completions/mean_terminated_length": 272.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11103155184537172, "epoch": 1.058, "frac_reward_zero_std": 0.0, "grad_norm": 0.16242334246635437, "learning_rate": 1.8179653455653337e-06, "loss": -0.0042, "num_tokens": 4593803.0, "reward": 10.842390060424805, "reward_std": 6.011104583740234, "rewards/fitness_reward/mean": 6.547800064086914, "rewards/fitness_reward/std": 2.580008029937744, "rewards/kidney_reward/mean": 1.922468662261963, "rewards/kidney_reward/std": 1.644716501235962, "rewards/length2tails_reward/mean": 0.7669105529785156, "rewards/length2tails_reward/std": 0.3401075005531311, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.195430278778076, "rewards/thermo_reward/std": 2.555251359939575, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.6875, "completions/mean_terminated_length": 271.6875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10503857117146254, "epoch": 1.06, "frac_reward_zero_std": 0.0, "grad_norm": 0.08899249881505966, "learning_rate": 1.8172270177295124e-06, "loss": -0.0073, "num_tokens": 4602529.0, "reward": 12.144147872924805, "reward_std": 3.435709238052368, "rewards/fitness_reward/mean": 7.050104141235352, "rewards/fitness_reward/std": 1.7597377300262451, "rewards/kidney_reward/mean": 2.4563241004943848, "rewards/kidney_reward/std": 0.5482203960418701, "rewards/length2tails_reward/mean": 0.691953182220459, "rewards/length2tails_reward/std": 0.35609135031700134, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4685235023498535, "rewards/thermo_reward/std": 1.832617163658142, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.9375, "completions/mean_terminated_length": 270.9375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.0973681602627039, "epoch": 1.062, "frac_reward_zero_std": 0.0, "grad_norm": 0.05948628485202789, "learning_rate": 1.8164873461691986e-06, "loss": -0.0036, "num_tokens": 4611231.0, "reward": 11.227486610412598, "reward_std": 5.46626091003418, "rewards/fitness_reward/mean": 6.693514823913574, "rewards/fitness_reward/std": 2.635545253753662, "rewards/kidney_reward/mean": 2.0863637924194336, "rewards/kidney_reward/std": 1.2824519872665405, "rewards/length2tails_reward/mean": 0.6062266826629639, "rewards/length2tails_reward/std": 0.4095900058746338, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.286984920501709, "rewards/thermo_reward/std": 2.25620174407959, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.53125, "completions/mean_terminated_length": 272.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10592686384916306, "epoch": 1.064, "frac_reward_zero_std": 0.0, "grad_norm": 0.07250142097473145, "learning_rate": 1.8157463321005966e-06, "loss": -0.0022, "num_tokens": 4619984.0, "reward": 10.381654739379883, "reward_std": 6.290080547332764, "rewards/fitness_reward/mean": 6.5366058349609375, "rewards/fitness_reward/std": 2.837371587753296, "rewards/kidney_reward/mean": 1.9228678941726685, "rewards/kidney_reward/std": 1.5469324588775635, "rewards/length2tails_reward/mean": 0.7551746368408203, "rewards/length2tails_reward/std": 0.28403139114379883, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.7466638088226318, "rewards/thermo_reward/std": 2.5262794494628906, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11166021507233381, "epoch": 1.066, "frac_reward_zero_std": 0.0, "grad_norm": 0.08945536613464355, "learning_rate": 1.8150039767421178e-06, "loss": -0.0124, "num_tokens": 4628720.0, "reward": 10.623258590698242, "reward_std": 5.707748889923096, "rewards/fitness_reward/mean": 6.571508407592773, "rewards/fitness_reward/std": 2.4442975521087646, "rewards/kidney_reward/mean": 1.9549903869628906, "rewards/kidney_reward/std": 1.4934356212615967, "rewards/length2tails_reward/mean": 0.6912268996238708, "rewards/length2tails_reward/std": 0.355633020401001, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.927636742591858, "rewards/thermo_reward/std": 2.6435155868530273, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.3125, "completions/mean_terminated_length": 270.3125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10576423350721598, "epoch": 1.068, "frac_reward_zero_std": 0.0, "grad_norm": 0.08647046983242035, "learning_rate": 1.8142602813143784e-06, "loss": -0.0035, "num_tokens": 4637402.0, "reward": 12.811609268188477, "reward_std": 2.2232515811920166, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5006520748138428, "rewards/kidney_reward/std": 0.5732922554016113, "rewards/length2tails_reward/mean": 0.6195791363716125, "rewards/length2tails_reward/std": 0.3544762432575226, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.787814140319824, "rewards/thermo_reward/std": 1.824695348739624, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.375, "completions/mean_terminated_length": 270.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10622578114271164, "epoch": 1.07, "frac_reward_zero_std": 0.0, "grad_norm": 0.1089043915271759, "learning_rate": 1.8135152470401996e-06, "loss": -0.0039, "num_tokens": 4646086.0, "reward": 9.206745147705078, "reward_std": 8.957849502563477, "rewards/fitness_reward/mean": 5.844204902648926, "rewards/fitness_reward/std": 4.084497928619385, "rewards/kidney_reward/mean": 1.6340656280517578, "rewards/kidney_reward/std": 2.2367794513702393, "rewards/length2tails_reward/mean": 0.6045545339584351, "rewards/length2tails_reward/std": 0.37398043274879456, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.5680193901062012, "rewards/thermo_reward/std": 3.0591702461242676, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.90625, "completions/mean_terminated_length": 269.90625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.08916517347097397, "epoch": 1.072, "frac_reward_zero_std": 0.0, "grad_norm": 0.08490046858787537, "learning_rate": 1.8127688751446026e-06, "loss": -0.0052, "num_tokens": 4654755.0, "reward": 11.566045761108398, "reward_std": 4.726414680480957, "rewards/fitness_reward/mean": 6.572394371032715, "rewards/fitness_reward/std": 2.440897226333618, "rewards/kidney_reward/mean": 2.2182819843292236, "rewards/kidney_reward/std": 0.9536340832710266, "rewards/length2tails_reward/mean": 0.5882750153541565, "rewards/length2tails_reward/std": 0.3553884029388428, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 2.6227917671203613, "rewards/thermo_reward/std": 1.9189839363098145, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.8125, "completions/mean_terminated_length": 271.8125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11149153485894203, "epoch": 1.074, "frac_reward_zero_std": 0.0, "grad_norm": 0.09485744684934616, "learning_rate": 1.8120211668548086e-06, "loss": -0.0049, "num_tokens": 4663485.0, "reward": 12.444345474243164, "reward_std": 2.8540360927581787, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3471357822418213, "rewards/kidney_reward/std": 0.7977660298347473, "rewards/length2tails_reward/mean": 0.7017210721969604, "rewards/length2tails_reward/std": 0.3152361214160919, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.623361587524414, "rewards/thermo_reward/std": 2.011770009994507, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1078566275537014, "epoch": 1.076, "frac_reward_zero_std": 0.0, "grad_norm": 0.06216276437044144, "learning_rate": 1.8112721234002357e-06, "loss": -0.0054, "num_tokens": 4672221.0, "reward": 13.001523971557617, "reward_std": 1.9588590860366821, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.517043113708496, "rewards/kidney_reward/std": 0.2941751182079315, "rewards/length2tails_reward/mean": 0.7192487716674805, "rewards/length2tails_reward/std": 0.333200067281723, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.951371192932129, "rewards/thermo_reward/std": 1.7540698051452637, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 271.1875, "completions/mean_terminated_length": 271.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09430662263184786, "epoch": 1.078, "frac_reward_zero_std": 0.0, "grad_norm": 0.5271546244621277, "learning_rate": 1.810521746012498e-06, "loss": 0.0065, "num_tokens": 4680931.0, "reward": 12.489099502563477, "reward_std": 4.14736795425415, "rewards/fitness_reward/mean": 7.024723052978516, "rewards/fitness_reward/std": 1.9033170938491821, "rewards/kidney_reward/mean": 2.435086250305176, "rewards/kidney_reward/std": 0.9357979893684387, "rewards/length2tails_reward/mean": 0.623183012008667, "rewards/length2tails_reward/std": 0.3528957962989807, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8669726848602295, "rewards/thermo_reward/std": 1.8115577697753906, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11281314957886934, "epoch": 1.08, "frac_reward_zero_std": 0.0, "grad_norm": 0.09693945199251175, "learning_rate": 1.8097700359254024e-06, "loss": -0.0003, "num_tokens": 4689651.0, "reward": 12.46048355102539, "reward_std": 2.768707752227783, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.3303580284118652, "rewards/kidney_reward/std": 0.8740337491035461, "rewards/length2tails_reward/mean": 0.6922803521156311, "rewards/length2tails_reward/std": 0.3110790252685547, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.599713087081909, "rewards/thermo_reward/std": 2.1388442516326904, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10293409042060375, "epoch": 1.082, "frac_reward_zero_std": 0.0, "grad_norm": 0.06482190638780594, "learning_rate": 1.8090169943749474e-06, "loss": -0.0045, "num_tokens": 4698382.0, "reward": 12.603124618530273, "reward_std": 3.6950109004974365, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.3166470527648926, "rewards/kidney_reward/std": 0.8981736898422241, "rewards/length2tails_reward/mean": 0.7382228374481201, "rewards/length2tails_reward/std": 0.30910226702690125, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0596022605895996, "rewards/thermo_reward/std": 1.7408092021942139, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.5625, "completions/mean_terminated_length": 271.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10494351293891668, "epoch": 1.084, "frac_reward_zero_std": 0.0, "grad_norm": 0.08620021492242813, "learning_rate": 1.8082626225993205e-06, "loss": 0.0015, "num_tokens": 4707104.0, "reward": 10.572208404541016, "reward_std": 6.045322895050049, "rewards/fitness_reward/mean": 6.605555534362793, "rewards/fitness_reward/std": 2.77128529548645, "rewards/kidney_reward/mean": 2.061997413635254, "rewards/kidney_reward/std": 1.4656015634536743, "rewards/length2tails_reward/mean": 0.6626001596450806, "rewards/length2tails_reward/std": 0.3383328318595886, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.7383958101272583, "rewards/thermo_reward/std": 2.649104118347168, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10786995757371187, "epoch": 1.086, "frac_reward_zero_std": 0.0, "grad_norm": 0.0641787126660347, "learning_rate": 1.8075069218388962e-06, "loss": 0.0015, "num_tokens": 4715808.0, "reward": 12.383892059326172, "reward_std": 4.372714042663574, "rewards/fitness_reward/mean": 7.013314723968506, "rewards/fitness_reward/std": 1.9678521156311035, "rewards/kidney_reward/mean": 2.3276991844177246, "rewards/kidney_reward/std": 0.9477407336235046, "rewards/length2tails_reward/mean": 0.6996359825134277, "rewards/length2tails_reward/std": 0.27757784724235535, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.872915267944336, "rewards/thermo_reward/std": 1.9278887510299683, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.71875, "completions/mean_terminated_length": 270.71875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10373136959969997, "epoch": 1.088, "frac_reward_zero_std": 0.0, "grad_norm": 0.08191133290529251, "learning_rate": 1.8067498933362355e-06, "loss": -0.0008, "num_tokens": 4724503.0, "reward": 12.963687896728516, "reward_std": 2.7823119163513184, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.405520439147949, "rewards/kidney_reward/std": 0.8700054883956909, "rewards/length2tails_reward/mean": 0.6506054997444153, "rewards/length2tails_reward/std": 0.36144185066223145, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0319223403930664, "rewards/thermo_reward/std": 2.0205771923065186, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.03125, "completions/mean_terminated_length": 271.03125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10066128429025412, "epoch": 1.09, "frac_reward_zero_std": 0.0, "grad_norm": 0.07312504202127457, "learning_rate": 1.8059915383360806e-06, "loss": -0.0055, "num_tokens": 4733208.0, "reward": 12.294307708740234, "reward_std": 3.056042432785034, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.350677251815796, "rewards/kidney_reward/std": 0.8858228921890259, "rewards/length2tails_reward/mean": 0.6339554786682129, "rewards/length2tails_reward/std": 0.3406355082988739, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4765591621398926, "rewards/thermo_reward/std": 2.209420680999756, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.12119703646749258, "epoch": 1.092, "frac_reward_zero_std": 0.0, "grad_norm": 0.08009244501590729, "learning_rate": 1.805231858085356e-06, "loss": -0.002, "num_tokens": 4741972.0, "reward": 12.57183837890625, "reward_std": 1.7864048480987549, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.517043113708496, "rewards/kidney_reward/std": 0.2941751182079315, "rewards/length2tails_reward/mean": 0.7900687456130981, "rewards/length2tails_reward/std": 0.25023266673088074, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.514604091644287, "rewards/thermo_reward/std": 1.6984219551086426, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10524578206241131, "epoch": 1.094, "frac_reward_zero_std": 0.0, "grad_norm": 0.0970984622836113, "learning_rate": 1.8044708538331654e-06, "loss": -0.0047, "num_tokens": 4750693.0, "reward": 12.265960693359375, "reward_std": 3.2419352531433105, "rewards/fitness_reward/mean": 6.930270195007324, "rewards/fitness_reward/std": 2.1268088817596436, "rewards/kidney_reward/mean": 2.4896838665008545, "rewards/kidney_reward/std": 0.3229711949825287, "rewards/length2tails_reward/mean": 0.6985119581222534, "rewards/length2tails_reward/std": 0.33531445264816284, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6761550903320312, "rewards/thermo_reward/std": 1.9302542209625244, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10378769133239985, "epoch": 1.096, "frac_reward_zero_std": 0.0, "grad_norm": 0.06036684662103653, "learning_rate": 1.8037085268307885e-06, "loss": -0.0035, "num_tokens": 4759389.0, "reward": 13.266702651977539, "reward_std": 1.3902013301849365, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.6663521528244019, "rewards/length2tails_reward/std": 0.30723318457603455, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1671204566955566, "rewards/thermo_reward/std": 1.2469242811203003, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 270.15625, "completions/mean_terminated_length": 270.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10272781364619732, "epoch": 1.098, "frac_reward_zero_std": 0.0, "grad_norm": 0.2102782130241394, "learning_rate": 1.8029448783316813e-06, "loss": -0.0062, "num_tokens": 4768066.0, "reward": 11.990425109863281, "reward_std": 5.225229740142822, "rewards/fitness_reward/mean": 6.418025493621826, "rewards/fitness_reward/std": 2.9806253910064697, "rewards/kidney_reward/mean": 2.3426103591918945, "rewards/kidney_reward/std": 0.9024714231491089, "rewards/length2tails_reward/mean": 0.600758969783783, "rewards/length2tails_reward/std": 0.387471467256546, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.069714069366455, "rewards/thermo_reward/std": 1.4123339653015137, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.6875, "completions/mean_terminated_length": 271.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09544859733432531, "epoch": 1.1, "frac_reward_zero_std": 0.0, "grad_norm": 0.07070183753967285, "learning_rate": 1.8021799095914708e-06, "loss": -0.0042, "num_tokens": 4776792.0, "reward": 12.756484985351562, "reward_std": 2.2132277488708496, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.503726005554199, "rewards/kidney_reward/std": 0.5565682649612427, "rewards/length2tails_reward/mean": 0.643738865852356, "rewards/length2tails_reward/std": 0.368051677942276, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7272000312805176, "rewards/thermo_reward/std": 1.876189947128296, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.4375, "completions/mean_terminated_length": 271.4375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.1080935113132, "epoch": 1.102, "frac_reward_zero_std": 0.0, "grad_norm": 0.10448921471834183, "learning_rate": 1.8014136218679566e-06, "loss": 0.0011, "num_tokens": 4785510.0, "reward": 12.542806625366211, "reward_std": 2.689107894897461, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4469244480133057, "rewards/kidney_reward/std": 0.7286657691001892, "rewards/length2tails_reward/mean": 0.6926784515380859, "rewards/length2tails_reward/std": 0.2712414860725403, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.622939109802246, "rewards/thermo_reward/std": 1.9818955659866333, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10477001033723354, "epoch": 1.104, "frac_reward_zero_std": 0.0, "grad_norm": 0.1856602132320404, "learning_rate": 1.800646016421106e-06, "loss": -0.0055, "num_tokens": 4794268.0, "reward": 12.144149780273438, "reward_std": 4.565323352813721, "rewards/fitness_reward/mean": 6.996285438537598, "rewards/fitness_reward/std": 2.064185857772827, "rewards/kidney_reward/mean": 2.3954811096191406, "rewards/kidney_reward/std": 1.0103658437728882, "rewards/length2tails_reward/mean": 0.7687793970108032, "rewards/length2tails_reward/std": 0.2617262005805969, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.5755057334899902, "rewards/thermo_reward/std": 2.078153610229492, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.3125, "completions/mean_terminated_length": 272.3125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09166521392762661, "epoch": 1.106, "frac_reward_zero_std": 0.0, "grad_norm": 0.08722712099552155, "learning_rate": 1.7998770945130538e-06, "loss": -0.0095, "num_tokens": 4803014.0, "reward": 11.275504112243652, "reward_std": 4.791029930114746, "rewards/fitness_reward/mean": 6.587876796722412, "rewards/fitness_reward/std": 2.605294704437256, "rewards/kidney_reward/mean": 2.1699628829956055, "rewards/kidney_reward/std": 1.2778137922286987, "rewards/length2tails_reward/mean": 0.6810904741287231, "rewards/length2tails_reward/std": 0.37631070613861084, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.349555730819702, "rewards/thermo_reward/std": 2.1994354724884033, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.71875, "completions/mean_terminated_length": 271.71875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1041144272312522, "epoch": 1.108, "frac_reward_zero_std": 0.0, "grad_norm": 0.07147329300642014, "learning_rate": 1.7991068574080986e-06, "loss": -0.0033, "num_tokens": 4811741.0, "reward": 11.137859344482422, "reward_std": 5.557100772857666, "rewards/fitness_reward/mean": 6.831142425537109, "rewards/fitness_reward/std": 2.1162006855010986, "rewards/kidney_reward/mean": 2.0171127319335938, "rewards/kidney_reward/std": 1.558064341545105, "rewards/length2tails_reward/mean": 0.6767317056655884, "rewards/length2tails_reward/std": 0.36727866530418396, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.1219303607940674, "rewards/thermo_reward/std": 2.5796494483947754, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.96875, "completions/mean_terminated_length": 270.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10824454110115767, "epoch": 1.11, "frac_reward_zero_std": 0.0, "grad_norm": 0.0778418481349945, "learning_rate": 1.7983353063727014e-06, "loss": -0.0079, "num_tokens": 4820444.0, "reward": 13.1099853515625, "reward_std": 1.6137299537658691, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.536646842956543, "rewards/kidney_reward/std": 0.5081712007522583, "rewards/length2tails_reward/mean": 0.694353461265564, "rewards/length2tails_reward/std": 0.31348589062690735, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.042717933654785, "rewards/thermo_reward/std": 1.2481979131698608, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 270.5625, "completions/mean_terminated_length": 270.5625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09657656960189342, "epoch": 1.112, "frac_reward_zero_std": 0.0, "grad_norm": 0.10516510903835297, "learning_rate": 1.7975624426754845e-06, "loss": -0.0027, "num_tokens": 4829134.0, "reward": 11.269773483276367, "reward_std": 6.188965797424316, "rewards/fitness_reward/mean": 6.2905683517456055, "rewards/fitness_reward/std": 3.396219253540039, "rewards/kidney_reward/mean": 2.1874663829803467, "rewards/kidney_reward/std": 1.343290090560913, "rewards/length2tails_reward/mean": 0.6322581171989441, "rewards/length2tails_reward/std": 0.3359857499599457, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.628514051437378, "rewards/thermo_reward/std": 1.8966007232666016, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.40625, "completions/mean_terminated_length": 271.40625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11595190409570932, "epoch": 1.114, "frac_reward_zero_std": 0.0, "grad_norm": 0.1653381735086441, "learning_rate": 1.7967882675872276e-06, "loss": 0.0036, "num_tokens": 4837851.0, "reward": 11.56589126586914, "reward_std": 5.274896144866943, "rewards/fitness_reward/mean": 6.966367721557617, "rewards/fitness_reward/std": 2.2334251403808594, "rewards/kidney_reward/mean": 2.231813430786133, "rewards/kidney_reward/std": 1.3505194187164307, "rewards/length2tails_reward/mean": 0.6987495422363281, "rewards/length2tails_reward/std": 0.3107140362262726, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.1978354454040527, "rewards/thermo_reward/std": 2.329174518585205, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.40625, "completions/mean_terminated_length": 272.40625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11444343067705631, "epoch": 1.116, "frac_reward_zero_std": 0.0, "grad_norm": 0.08911436796188354, "learning_rate": 1.7960127823808678e-06, "loss": 0.0022, "num_tokens": 4846600.0, "reward": 11.401962280273438, "reward_std": 5.1144914627075195, "rewards/fitness_reward/mean": 6.89082145690918, "rewards/fitness_reward/std": 2.0927488803863525, "rewards/kidney_reward/mean": 2.1250972747802734, "rewards/kidney_reward/std": 1.3569532632827759, "rewards/length2tails_reward/mean": 0.7734675407409668, "rewards/length2tails_reward/std": 0.2523795962333679, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.2086966037750244, "rewards/thermo_reward/std": 2.2721035480499268, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.65625, "completions/mean_terminated_length": 271.65625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10886608902364969, "epoch": 1.1179999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.08465147018432617, "learning_rate": 1.7952359883314953e-06, "loss": -0.006, "num_tokens": 4855325.0, "reward": 12.46010684967041, "reward_std": 2.8142449855804443, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.334458827972412, "rewards/kidney_reward/std": 0.851756751537323, "rewards/length2tails_reward/mean": 0.7321093082427979, "rewards/length2tails_reward/std": 0.2910517752170563, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.591252088546753, "rewards/thermo_reward/std": 2.0499556064605713, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09685478266328573, "epoch": 1.12, "frac_reward_zero_std": 0.0, "grad_norm": 0.09345784038305283, "learning_rate": 1.7944578867163528e-06, "loss": -0.0012, "num_tokens": 4864080.0, "reward": 11.20602035522461, "reward_std": 6.437448978424072, "rewards/fitness_reward/mean": 6.544711589813232, "rewards/fitness_reward/std": 3.0064234733581543, "rewards/kidney_reward/mean": 2.0772125720977783, "rewards/kidney_reward/std": 1.6922893524169922, "rewards/length2tails_reward/mean": 0.7151135802268982, "rewards/length2tails_reward/std": 0.3349017798900604, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4125847816467285, "rewards/thermo_reward/std": 2.328195571899414, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0986628895625472, "epoch": 1.1219999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.09137522429227829, "learning_rate": 1.7936784788148325e-06, "loss": -0.0059, "num_tokens": 4872800.0, "reward": 12.53663158416748, "reward_std": 3.1190147399902344, "rewards/fitness_reward/mean": 7.188657283782959, "rewards/fitness_reward/std": 0.5449937582015991, "rewards/kidney_reward/mean": 2.36080002784729, "rewards/kidney_reward/std": 0.7742516994476318, "rewards/length2tails_reward/mean": 0.688651442527771, "rewards/length2tails_reward/std": 0.31537818908691406, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8183093070983887, "rewards/thermo_reward/std": 2.1796393394470215, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.11182715278118849, "epoch": 1.124, "frac_reward_zero_std": 0.0, "grad_norm": 0.43471959233283997, "learning_rate": 1.792897765908475e-06, "loss": -0.0146, "num_tokens": 4881464.0, "reward": 9.44644546508789, "reward_std": 8.440580368041992, "rewards/fitness_reward/mean": 6.1534929275512695, "rewards/fitness_reward/std": 3.818556308746338, "rewards/kidney_reward/mean": 1.7774693965911865, "rewards/kidney_reward/std": 2.152588367462158, "rewards/length2tails_reward/mean": 0.6347315907478333, "rewards/length2tails_reward/std": 0.38687554001808167, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.3520097732543945, "rewards/thermo_reward/std": 3.0032525062561035, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.5625, "completions/mean_terminated_length": 270.5625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09960361663252115, "epoch": 1.126, "frac_reward_zero_std": 0.0, "grad_norm": 0.15998654067516327, "learning_rate": 1.792115749280967e-06, "loss": -0.009, "num_tokens": 4890154.0, "reward": 11.679988861083984, "reward_std": 4.114315032958984, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.2523083686828613, "rewards/kidney_reward/std": 0.9392135739326477, "rewards/length2tails_reward/mean": 0.583635687828064, "rewards/length2tails_reward/std": 0.39422547817230225, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.2162628173828125, "rewards/thermo_reward/std": 2.487140655517578, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10299806576222181, "epoch": 1.1280000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.08876167237758636, "learning_rate": 1.791332430218138e-06, "loss": -0.0033, "num_tokens": 4898918.0, "reward": 11.167304992675781, "reward_std": 5.945474624633789, "rewards/fitness_reward/mean": 6.2546796798706055, "rewards/fitness_reward/std": 3.3238136768341064, "rewards/kidney_reward/mean": 2.1046230792999268, "rewards/kidney_reward/std": 1.303746223449707, "rewards/length2tails_reward/mean": 0.7680449485778809, "rewards/length2tails_reward/std": 0.2990765869617462, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.631197929382324, "rewards/thermo_reward/std": 1.9832183122634888, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 269.6875, "completions/mean_terminated_length": 269.6875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09318594913929701, "epoch": 1.13, "frac_reward_zero_std": 0.0, "grad_norm": 0.08641897886991501, "learning_rate": 1.7905478100079596e-06, "loss": -0.0062, "num_tokens": 4907580.0, "reward": 12.202686309814453, "reward_std": 4.635074138641357, "rewards/fitness_reward/mean": 6.691686153411865, "rewards/fitness_reward/std": 2.6358587741851807, "rewards/kidney_reward/mean": 2.3179144859313965, "rewards/kidney_reward/std": 0.9202808141708374, "rewards/length2tails_reward/mean": 0.5614801645278931, "rewards/length2tails_reward/std": 0.38135483860969543, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.036937713623047, "rewards/thermo_reward/std": 1.7109110355377197, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10470135603100061, "epoch": 1.1320000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.05364145338535309, "learning_rate": 1.7897618899405421e-06, "loss": -0.0047, "num_tokens": 4916284.0, "reward": 13.045207023620605, "reward_std": 2.1981866359710693, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.508059501647949, "rewards/kidney_reward/std": 0.5330638289451599, "rewards/length2tails_reward/mean": 0.6730586290359497, "rewards/length2tails_reward/std": 0.27641263604164124, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0086560249328613, "rewards/thermo_reward/std": 1.7391635179519653, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.34375, "completions/mean_terminated_length": 271.34375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0999163817614317, "epoch": 1.134, "frac_reward_zero_std": 0.0, "grad_norm": 0.08887635916471481, "learning_rate": 1.7889746713081341e-06, "loss": -0.0018, "num_tokens": 4924999.0, "reward": 11.056597709655762, "reward_std": 6.2495503425598145, "rewards/fitness_reward/mean": 6.679024696350098, "rewards/fitness_reward/std": 2.6855037212371826, "rewards/kidney_reward/mean": 1.9901930093765259, "rewards/kidney_reward/std": 1.545816421508789, "rewards/length2tails_reward/mean": 0.658514678478241, "rewards/length2tails_reward/std": 0.3207051753997803, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.2215282917022705, "rewards/thermo_reward/std": 2.6121809482574463, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10158183518797159, "epoch": 1.1360000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.06434900313615799, "learning_rate": 1.7881861554051184e-06, "loss": 0.0011, "num_tokens": 4933723.0, "reward": 12.243963241577148, "reward_std": 4.706650733947754, "rewards/fitness_reward/mean": 6.951861381530762, "rewards/fitness_reward/std": 2.006192445755005, "rewards/kidney_reward/mean": 2.3538095951080322, "rewards/kidney_reward/std": 1.1005569696426392, "rewards/length2tails_reward/mean": 0.7174511551856995, "rewards/length2tails_reward/std": 0.26794394850730896, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.766547918319702, "rewards/thermo_reward/std": 1.9785237312316895, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.96875, "completions/mean_terminated_length": 270.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10938147269189358, "epoch": 1.138, "frac_reward_zero_std": 0.0, "grad_norm": 0.08531554788351059, "learning_rate": 1.787396343528012e-06, "loss": -0.0029, "num_tokens": 4942426.0, "reward": 12.526836395263672, "reward_std": 3.3885457515716553, "rewards/fitness_reward/mean": 6.98751163482666, "rewards/fitness_reward/std": 2.113816022872925, "rewards/kidney_reward/mean": 2.41365909576416, "rewards/kidney_reward/std": 0.7694591879844666, "rewards/length2tails_reward/mean": 0.6483283042907715, "rewards/length2tails_reward/std": 0.34616991877555847, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9608325958251953, "rewards/thermo_reward/std": 1.7782973051071167, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.1875, "completions/mean_terminated_length": 271.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1083859596401453, "epoch": 1.1400000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.09535173326730728, "learning_rate": 1.7866052369754613e-06, "loss": 0.0008, "num_tokens": 4951136.0, "reward": 11.748469352722168, "reward_std": 4.731593608856201, "rewards/fitness_reward/mean": 6.976041316986084, "rewards/fitness_reward/std": 1.871350884437561, "rewards/kidney_reward/mean": 2.1608309745788574, "rewards/kidney_reward/std": 1.228219747543335, "rewards/length2tails_reward/mean": 0.6496375799179077, "rewards/length2tails_reward/std": 0.3468906879425049, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4466333389282227, "rewards/thermo_reward/std": 2.20149827003479, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10877596493810415, "epoch": 1.142, "frac_reward_zero_std": 0.0, "grad_norm": 0.08137992769479752, "learning_rate": 1.7858128370482423e-06, "loss": -0.0058, "num_tokens": 4959856.0, "reward": 12.81321907043457, "reward_std": 1.8809332847595215, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.6763936281204224, "rewards/length2tails_reward/std": 0.36004820466041565, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.712632894515991, "rewards/thermo_reward/std": 1.7976608276367188, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11456997133791447, "epoch": 1.144, "frac_reward_zero_std": 0.0, "grad_norm": 0.07577666640281677, "learning_rate": 1.785019145049258e-06, "loss": -0.0032, "num_tokens": 4968601.0, "reward": 10.718204498291016, "reward_std": 6.012451171875, "rewards/fitness_reward/mean": 6.324345588684082, "rewards/fitness_reward/std": 3.2866885662078857, "rewards/kidney_reward/mean": 2.104271411895752, "rewards/kidney_reward/std": 1.3013216257095337, "rewards/length2tails_reward/mean": 0.6984899044036865, "rewards/length2tails_reward/std": 0.34832921624183655, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.119739532470703, "rewards/thermo_reward/std": 2.407792091369629, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 270.96875, "completions/mean_terminated_length": 270.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11358758341521025, "epoch": 1.146, "frac_reward_zero_std": 0.0, "grad_norm": 0.06758356094360352, "learning_rate": 1.7842241622835354e-06, "loss": 0.002, "num_tokens": 4977304.0, "reward": 12.825021743774414, "reward_std": 1.961801290512085, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4824347496032715, "rewards/kidney_reward/std": 0.5400055050849915, "rewards/length2tails_reward/mean": 0.6755943298339844, "rewards/length2tails_reward/std": 0.3082455098628998, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.813843250274658, "rewards/thermo_reward/std": 1.6336705684661865, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10494345147162676, "epoch": 1.148, "frac_reward_zero_std": 0.0, "grad_norm": 0.09451685100793839, "learning_rate": 1.7834278900582237e-06, "loss": 0.0028, "num_tokens": 4985984.0, "reward": 12.46784496307373, "reward_std": 4.835677623748779, "rewards/fitness_reward/mean": 7.028944969177246, "rewards/fitness_reward/std": 1.8794351816177368, "rewards/kidney_reward/mean": 2.3567276000976562, "rewards/kidney_reward/std": 1.2254520654678345, "rewards/length2tails_reward/mean": 0.6347229480743408, "rewards/length2tails_reward/std": 0.34789660573005676, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9186999797821045, "rewards/thermo_reward/std": 1.9678254127502441, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.40625, "completions/mean_terminated_length": 272.40625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11180542130023241, "epoch": 1.15, "frac_reward_zero_std": 0.0, "grad_norm": 0.08369499444961548, "learning_rate": 1.7826303296825924e-06, "loss": 0.0009, "num_tokens": 4994733.0, "reward": 12.160469055175781, "reward_std": 4.53151798248291, "rewards/fitness_reward/mean": 6.954615116119385, "rewards/fitness_reward/std": 1.9908239841461182, "rewards/kidney_reward/mean": 2.3553924560546875, "rewards/kidney_reward/std": 0.9413848519325256, "rewards/length2tails_reward/mean": 0.7666806578636169, "rewards/length2tails_reward/std": 0.2807237207889557, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6737937927246094, "rewards/thermo_reward/std": 2.1368064880371094, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.6875, "completions/mean_terminated_length": 270.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09493160434067249, "epoch": 1.152, "frac_reward_zero_std": 0.0, "grad_norm": 0.07596372812986374, "learning_rate": 1.7818314824680298e-06, "loss": -0.0025, "num_tokens": 5003427.0, "reward": 12.270533561706543, "reward_std": 1.898982048034668, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.517043113708496, "rewards/kidney_reward/std": 0.2941751182079315, "rewards/length2tails_reward/mean": 0.5976208448410034, "rewards/length2tails_reward/std": 0.35658320784568787, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.232543706893921, "rewards/thermo_reward/std": 1.8339343070983887, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.10535383597016335, "epoch": 1.154, "frac_reward_zero_std": 0.0, "grad_norm": 0.16355304419994354, "learning_rate": 1.7810313497280387e-06, "loss": 0.0026, "num_tokens": 5012103.0, "reward": 12.154855728149414, "reward_std": 5.096898078918457, "rewards/fitness_reward/mean": 7.000971794128418, "rewards/fitness_reward/std": 2.0376741886138916, "rewards/kidney_reward/mean": 2.354713201522827, "rewards/kidney_reward/std": 1.2366715669631958, "rewards/length2tails_reward/mean": 0.6913293600082397, "rewards/length2tails_reward/std": 0.3156411349773407, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.630038261413574, "rewards/thermo_reward/std": 2.201185941696167, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09658471308648586, "epoch": 1.156, "frac_reward_zero_std": 0.0, "grad_norm": 0.06265582889318466, "learning_rate": 1.7802299327782366e-06, "loss": -0.0059, "num_tokens": 5020799.0, "reward": 11.959808349609375, "reward_std": 3.974642276763916, "rewards/fitness_reward/mean": 6.929236888885498, "rewards/fitness_reward/std": 2.1325860023498535, "rewards/kidney_reward/mean": 2.4361748695373535, "rewards/kidney_reward/std": 0.6509256958961487, "rewards/length2tails_reward/mean": 0.630581259727478, "rewards/length2tails_reward/std": 0.4034130275249481, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.43133807182312, "rewards/thermo_reward/std": 2.2508015632629395, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.84375, "completions/mean_terminated_length": 270.84375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10977741423994303, "epoch": 1.158, "frac_reward_zero_std": 0.0, "grad_norm": 0.21476377546787262, "learning_rate": 1.7794272329363525e-06, "loss": 0.0012, "num_tokens": 5029498.0, "reward": 12.051267623901367, "reward_std": 4.693270683288574, "rewards/fitness_reward/mean": 6.910964488983154, "rewards/fitness_reward/std": 1.9847608804702759, "rewards/kidney_reward/mean": 2.3280601501464844, "rewards/kidney_reward/std": 1.00356924533844, "rewards/length2tails_reward/mean": 0.6109268665313721, "rewards/length2tails_reward/std": 0.32766100764274597, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.651151180267334, "rewards/thermo_reward/std": 2.1728739738464355, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10096995253115892, "epoch": 1.16, "frac_reward_zero_std": 0.0, "grad_norm": 0.11981163173913956, "learning_rate": 1.7786232515222241e-06, "loss": -0.0082, "num_tokens": 5038210.0, "reward": 11.612472534179688, "reward_std": 5.185047149658203, "rewards/fitness_reward/mean": 6.672616004943848, "rewards/fitness_reward/std": 2.490133762359619, "rewards/kidney_reward/mean": 2.2824039459228516, "rewards/kidney_reward/std": 1.100603461265564, "rewards/length2tails_reward/mean": 0.7170855402946472, "rewards/length2tails_reward/std": 0.3222355544567108, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4857451915740967, "rewards/thermo_reward/std": 2.2294204235076904, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.1875, "completions/mean_terminated_length": 272.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11031336337327957, "epoch": 1.162, "frac_reward_zero_std": 0.0, "grad_norm": 0.06516198813915253, "learning_rate": 1.7778179898577971e-06, "loss": -0.0036, "num_tokens": 5046952.0, "reward": 12.819649696350098, "reward_std": 2.8847804069519043, "rewards/fitness_reward/mean": 7.051357746124268, "rewards/fitness_reward/std": 1.7526482343673706, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7042213678359985, "rewards/length2tails_reward/std": 0.3321356475353241, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0261082649230957, "rewards/thermo_reward/std": 1.4118154048919678, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.6875, "completions/mean_terminated_length": 271.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11428037006407976, "epoch": 1.164, "frac_reward_zero_std": 0.0, "grad_norm": 0.09282801300287247, "learning_rate": 1.7770114492671224e-06, "loss": -0.0008, "num_tokens": 5055678.0, "reward": 12.54432487487793, "reward_std": 3.2210092544555664, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.324988842010498, "rewards/kidney_reward/std": 0.9561151266098022, "rewards/length2tails_reward/mean": 0.7320905327796936, "rewards/length2tails_reward/std": 0.30599185824394226, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.742450714111328, "rewards/thermo_reward/std": 2.1522037982940674, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.21875, "completions/mean_terminated_length": 270.21875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09224184788763523, "epoch": 1.166, "frac_reward_zero_std": 0.0, "grad_norm": 0.0960201770067215, "learning_rate": 1.776203631076353e-06, "loss": -0.0066, "num_tokens": 5064357.0, "reward": 11.877236366271973, "reward_std": 5.058732509613037, "rewards/fitness_reward/mean": 6.731173515319824, "rewards/fitness_reward/std": 2.4796977043151855, "rewards/kidney_reward/mean": 2.3305821418762207, "rewards/kidney_reward/std": 0.9962871670722961, "rewards/length2tails_reward/mean": 0.5928375124931335, "rewards/length2tails_reward/std": 0.36502107977867126, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6561970710754395, "rewards/thermo_reward/std": 2.1421453952789307, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.5625, "completions/mean_terminated_length": 271.5625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1109061436727643, "epoch": 1.168, "frac_reward_zero_std": 0.0, "grad_norm": 0.09443071484565735, "learning_rate": 1.7753945366137426e-06, "loss": 0.0031, "num_tokens": 5073079.0, "reward": 11.314729690551758, "reward_std": 5.843605041503906, "rewards/fitness_reward/mean": 6.943359851837158, "rewards/fitness_reward/std": 2.3635756969451904, "rewards/kidney_reward/mean": 2.131972074508667, "rewards/kidney_reward/std": 1.5495785474777222, "rewards/length2tails_reward/mean": 0.687659502029419, "rewards/length2tails_reward/std": 0.3384109139442444, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.070632219314575, "rewards/thermo_reward/std": 2.596601724624634, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11310446728020906, "epoch": 1.17, "frac_reward_zero_std": 0.0, "grad_norm": 0.11845079064369202, "learning_rate": 1.7745841672096442e-06, "loss": -0.0, "num_tokens": 5081829.0, "reward": 11.575647354125977, "reward_std": 5.9788408279418945, "rewards/fitness_reward/mean": 6.657668590545654, "rewards/fitness_reward/std": 2.790924310684204, "rewards/kidney_reward/mean": 2.1905972957611084, "rewards/kidney_reward/std": 1.4292532205581665, "rewards/length2tails_reward/mean": 0.7536600828170776, "rewards/length2tails_reward/std": 0.29540786147117615, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.552015781402588, "rewards/thermo_reward/std": 2.207545042037964, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.09375, "completions/mean_terminated_length": 271.09375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10994715616106987, "epoch": 1.172, "frac_reward_zero_std": 0.0, "grad_norm": 0.11424069106578827, "learning_rate": 1.7737725241965067e-06, "loss": -0.0041, "num_tokens": 5090536.0, "reward": 11.595703125, "reward_std": 5.179896354675293, "rewards/fitness_reward/mean": 6.970044136047363, "rewards/fitness_reward/std": 1.9047677516937256, "rewards/kidney_reward/mean": 2.108436346054077, "rewards/kidney_reward/std": 1.3753132820129395, "rewards/length2tails_reward/mean": 0.6399242877960205, "rewards/length2tails_reward/std": 0.3654977083206177, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.353229522705078, "rewards/thermo_reward/std": 2.6165781021118164, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 270.71875, "completions/mean_terminated_length": 270.71875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10835985839366913, "epoch": 1.174, "frac_reward_zero_std": 0.0, "grad_norm": 0.10886748135089874, "learning_rate": 1.7729596089088727e-06, "loss": -0.0056, "num_tokens": 5099231.0, "reward": 10.739278793334961, "reward_std": 5.102948188781738, "rewards/fitness_reward/mean": 6.629342079162598, "rewards/fitness_reward/std": 2.4391729831695557, "rewards/kidney_reward/mean": 1.9109983444213867, "rewards/kidney_reward/std": 1.4604812860488892, "rewards/length2tails_reward/mean": 0.6606801748275757, "rewards/length2tails_reward/std": 0.337507039308548, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.0328705310821533, "rewards/thermo_reward/std": 2.385857582092285, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 270.09375, "completions/mean_terminated_length": 270.09375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10174105782061815, "epoch": 1.176, "frac_reward_zero_std": 0.0, "grad_norm": 0.08054821938276291, "learning_rate": 1.7721454226833775e-06, "loss": -0.0052, "num_tokens": 5107906.0, "reward": 12.438180923461914, "reward_std": 3.1698365211486816, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.2543578147888184, "rewards/kidney_reward/std": 1.0256778001785278, "rewards/length2tails_reward/mean": 0.6150473356246948, "rewards/length2tails_reward/std": 0.34308475255966187, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6611340045928955, "rewards/thermo_reward/std": 2.239284038543701, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.28125, "completions/mean_terminated_length": 271.28125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1017656335607171, "epoch": 1.178, "frac_reward_zero_std": 0.0, "grad_norm": 0.1287049800157547, "learning_rate": 1.7713299668587457e-06, "loss": -0.004, "num_tokens": 5116619.0, "reward": 13.496252059936523, "reward_std": 1.0705134868621826, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.6924083232879639, "rewards/length2tails_reward/std": 0.3212348222732544, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.394064426422119, "rewards/thermo_reward/std": 0.9870204925537109, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.09375, "completions/mean_terminated_length": 269.09375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.10302947741001844, "epoch": 1.18, "frac_reward_zero_std": 0.0, "grad_norm": 0.2910614609718323, "learning_rate": 1.7705132427757892e-06, "loss": -0.002, "num_tokens": 5125262.0, "reward": 12.19694995880127, "reward_std": 4.453863143920898, "rewards/fitness_reward/mean": 7.043661117553711, "rewards/fitness_reward/std": 1.7961864471435547, "rewards/kidney_reward/mean": 2.3480801582336426, "rewards/kidney_reward/std": 1.1257485151290894, "rewards/length2tails_reward/mean": 0.6744387149810791, "rewards/length2tails_reward/std": 0.29502642154693604, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.637765407562256, "rewards/thermo_reward/std": 2.0495615005493164, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.375, "completions/mean_terminated_length": 270.375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.08723223209381104, "epoch": 1.182, "frac_reward_zero_std": 0.0, "grad_norm": 0.062244486063718796, "learning_rate": 1.769695251777406e-06, "loss": -0.0048, "num_tokens": 5133946.0, "reward": 11.108282089233398, "reward_std": 6.026435375213623, "rewards/fitness_reward/mean": 6.404687881469727, "rewards/fitness_reward/std": 3.023834705352783, "rewards/kidney_reward/mean": 2.187194585800171, "rewards/kidney_reward/std": 1.2497555017471313, "rewards/length2tails_reward/mean": 0.6100426912307739, "rewards/length2tails_reward/std": 0.38546499609947205, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.355395793914795, "rewards/thermo_reward/std": 2.284048080444336, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 270.59375, "completions/mean_terminated_length": 270.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11061716824769974, "epoch": 1.184, "frac_reward_zero_std": 0.0, "grad_norm": 0.1302819401025772, "learning_rate": 1.7688759952085763e-06, "loss": -0.0039, "num_tokens": 5142637.0, "reward": 10.234525680541992, "reward_std": 5.914739608764648, "rewards/fitness_reward/mean": 6.1420488357543945, "rewards/fitness_reward/std": 3.34014630317688, "rewards/kidney_reward/mean": 1.9530357122421265, "rewards/kidney_reward/std": 1.4134089946746826, "rewards/length2tails_reward/mean": 0.6440788507461548, "rewards/length2tails_reward/std": 0.3247174620628357, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 1.9750324487686157, "rewards/thermo_reward/std": 2.403918981552124, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 270.84375, "completions/mean_terminated_length": 270.84375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10884816199541092, "epoch": 1.186, "frac_reward_zero_std": 0.0, "grad_norm": 0.078577421605587, "learning_rate": 1.7680554744163623e-06, "loss": -0.0008, "num_tokens": 5151336.0, "reward": 12.893369674682617, "reward_std": 1.8842889070510864, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4896838665008545, "rewards/kidney_reward/std": 0.3229711949825287, "rewards/length2tails_reward/mean": 0.6638084650039673, "rewards/length2tails_reward/std": 0.34161463379859924, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.876119375228882, "rewards/thermo_reward/std": 1.6256120204925537, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.3125, "completions/mean_terminated_length": 272.3125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10069420002400875, "epoch": 1.188, "frac_reward_zero_std": 0.0, "grad_norm": 0.07682990282773972, "learning_rate": 1.7672336907499037e-06, "loss": -0.0025, "num_tokens": 5160082.0, "reward": 12.909473419189453, "reward_std": 2.9470677375793457, "rewards/fitness_reward/mean": 7.052633285522461, "rewards/fitness_reward/std": 1.7454336881637573, "rewards/kidney_reward/mean": 2.4896838665008545, "rewards/kidney_reward/std": 0.3229711949825287, "rewards/length2tails_reward/mean": 0.7239134311676025, "rewards/length2tails_reward/std": 0.35869836807250977, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.194766044616699, "rewards/thermo_reward/std": 1.3918113708496094, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 270.65625, "completions/mean_terminated_length": 270.65625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11134236957877874, "epoch": 1.19, "frac_reward_zero_std": 0.0, "grad_norm": 0.10810114443302155, "learning_rate": 1.7664106455604174e-06, "loss": 0.0043, "num_tokens": 5168775.0, "reward": 13.395252227783203, "reward_std": 1.441155195236206, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.6862658262252808, "rewards/length2tails_reward/std": 0.27184396982192993, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.26632022857666, "rewards/thermo_reward/std": 1.3266010284423828, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10387864802032709, "epoch": 1.192, "frac_reward_zero_std": 0.0, "grad_norm": 0.06299920380115509, "learning_rate": 1.7655863402011946e-06, "loss": -0.0062, "num_tokens": 5177503.0, "reward": 12.111298561096191, "reward_std": 3.954411268234253, "rewards/fitness_reward/mean": 6.99554443359375, "rewards/fitness_reward/std": 1.7628074884414673, "rewards/kidney_reward/mean": 2.238969326019287, "rewards/kidney_reward/std": 0.9901480674743652, "rewards/length2tails_reward/mean": 0.6856290698051453, "rewards/length2tails_reward/std": 0.3055051565170288, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.708221673965454, "rewards/thermo_reward/std": 1.982776403427124, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.46875, "completions/mean_terminated_length": 270.46875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11133202258497477, "epoch": 1.194, "frac_reward_zero_std": 0.0, "grad_norm": 0.08474539965391159, "learning_rate": 1.7647607760275985e-06, "loss": -0.0039, "num_tokens": 5186190.0, "reward": 13.106220245361328, "reward_std": 1.714718222618103, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.5953869819641113, "rewards/length2tails_reward/std": 0.35357436537742615, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.013735294342041, "rewards/thermo_reward/std": 1.5857094526290894, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.8125, "completions/mean_terminated_length": 270.8125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10823120549321175, "epoch": 1.196, "frac_reward_zero_std": 0.0, "grad_norm": 0.0782100260257721, "learning_rate": 1.7639339543970612e-06, "loss": -0.0021, "num_tokens": 5194888.0, "reward": 11.78781509399414, "reward_std": 4.305365085601807, "rewards/fitness_reward/mean": 7.188657283782959, "rewards/fitness_reward/std": 0.5449937582015991, "rewards/kidney_reward/mean": 2.051370143890381, "rewards/kidney_reward/std": 1.3266252279281616, "rewards/length2tails_reward/mean": 0.6458899974822998, "rewards/length2tails_reward/std": 0.35246819257736206, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.383197784423828, "rewards/thermo_reward/std": 2.690925121307373, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.84375, "completions/mean_terminated_length": 272.84375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11543369013816118, "epoch": 1.198, "frac_reward_zero_std": 0.0, "grad_norm": 0.06506896018981934, "learning_rate": 1.7631058766690839e-06, "loss": -0.0026, "num_tokens": 5203651.0, "reward": 11.880399703979492, "reward_std": 5.43539571762085, "rewards/fitness_reward/mean": 6.639829635620117, "rewards/fitness_reward/std": 2.624504566192627, "rewards/kidney_reward/mean": 2.231795310974121, "rewards/kidney_reward/std": 1.2990399599075317, "rewards/length2tails_reward/mean": 0.8059386610984802, "rewards/length2tails_reward/std": 0.2669394314289093, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.828181266784668, "rewards/thermo_reward/std": 1.7878499031066895, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10662618558853865, "epoch": 1.2, "frac_reward_zero_std": 0.0, "grad_norm": 0.19384585320949554, "learning_rate": 1.762276544205232e-06, "loss": -0.0036, "num_tokens": 5212379.0, "reward": 12.483643531799316, "reward_std": 2.7700603008270264, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.487910747528076, "rewards/kidney_reward/std": 0.6429896354675293, "rewards/length2tails_reward/mean": 0.69349205493927, "rewards/length2tails_reward/std": 0.31337374448776245, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.522707939147949, "rewards/thermo_reward/std": 2.1695806980133057, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.9375, "completions/mean_terminated_length": 271.9375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1081048995256424, "epoch": 1.202, "frac_reward_zero_std": 0.0, "grad_norm": 0.07642143219709396, "learning_rate": 1.7614459583691342e-06, "loss": 0.0001, "num_tokens": 5221113.0, "reward": 13.319950103759766, "reward_std": 1.2846815586090088, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7720909118652344, "rewards/length2tails_reward/std": 0.27542778849601746, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1824355125427246, "rewards/thermo_reward/std": 1.187772512435913, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10744845122098923, "epoch": 1.204, "frac_reward_zero_std": 0.0, "grad_norm": 0.08143097162246704, "learning_rate": 1.7606141205264808e-06, "loss": -0.0027, "num_tokens": 5229848.0, "reward": 12.947037696838379, "reward_std": 2.0212647914886475, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.517043113708496, "rewards/kidney_reward/std": 0.2941751182079315, "rewards/length2tails_reward/mean": 0.7308014035224915, "rewards/length2tails_reward/std": 0.2996106445789337, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8957290649414062, "rewards/thermo_reward/std": 1.8434280157089233, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.0625, "completions/mean_terminated_length": 271.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10248364508152008, "epoch": 1.206, "frac_reward_zero_std": 0.0, "grad_norm": 0.13778772950172424, "learning_rate": 1.7597810320450197e-06, "loss": 0.0046, "num_tokens": 5238554.0, "reward": 11.7276611328125, "reward_std": 6.353403091430664, "rewards/fitness_reward/mean": 6.516511917114258, "rewards/fitness_reward/std": 2.9167258739471436, "rewards/kidney_reward/mean": 2.183387517929077, "rewards/kidney_reward/std": 1.5369446277618408, "rewards/length2tails_reward/mean": 0.673989474773407, "rewards/length2tails_reward/std": 0.30423054099082947, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8603644371032715, "rewards/thermo_reward/std": 2.0831940174102783, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.9375, "completions/mean_terminated_length": 270.9375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.10588277224451303, "epoch": 1.208, "frac_reward_zero_std": 0.0, "grad_norm": 0.05363436043262482, "learning_rate": 1.7589466942945555e-06, "loss": -0.0022, "num_tokens": 5247256.0, "reward": 13.326116561889648, "reward_std": 1.268410086631775, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7883108854293823, "rewards/length2tails_reward/std": 0.2947376072406769, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.214339256286621, "rewards/thermo_reward/std": 1.1003755331039429, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0990461278706789, "epoch": 1.21, "frac_reward_zero_std": 0.0, "grad_norm": 0.15633194148540497, "learning_rate": 1.7581111086469473e-06, "loss": -0.0049, "num_tokens": 5255999.0, "reward": 12.139235496520996, "reward_std": 4.885914325714111, "rewards/fitness_reward/mean": 6.9905595779418945, "rewards/fitness_reward/std": 2.0965752601623535, "rewards/kidney_reward/mean": 2.3520703315734863, "rewards/kidney_reward/std": 1.1037944555282593, "rewards/length2tails_reward/mean": 0.6999367475509644, "rewards/length2tails_reward/std": 0.3327362835407257, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6266119480133057, "rewards/thermo_reward/std": 2.315368413925171, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.84375, "completions/mean_terminated_length": 272.84375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.102089773863554, "epoch": 1.212, "frac_reward_zero_std": 0.0, "grad_norm": 0.05936253443360329, "learning_rate": 1.7572742764761053e-06, "loss": -0.0042, "num_tokens": 5264762.0, "reward": 12.085190773010254, "reward_std": 4.8999714851379395, "rewards/fitness_reward/mean": 6.707517147064209, "rewards/fitness_reward/std": 2.5766422748565674, "rewards/kidney_reward/mean": 2.2042856216430664, "rewards/kidney_reward/std": 1.189802646636963, "rewards/length2tails_reward/mean": 0.7570756673812866, "rewards/length2tails_reward/std": 0.33921998739242554, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.997680425643921, "rewards/thermo_reward/std": 1.6423041820526123, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.90625, "completions/mean_terminated_length": 271.90625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0996887730434537, "epoch": 1.214, "frac_reward_zero_std": 0.0, "grad_norm": 0.0814649760723114, "learning_rate": 1.7564361991579904e-06, "loss": -0.0057, "num_tokens": 5273495.0, "reward": 12.056499481201172, "reward_std": 3.3672938346862793, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.4296531677246094, "rewards/kidney_reward/std": 0.5578335523605347, "rewards/length2tails_reward/mean": 0.6635578870773315, "rewards/length2tails_reward/std": 0.35432568192481995, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4074368476867676, "rewards/thermo_reward/std": 1.871981143951416, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.375, "completions/mean_terminated_length": 270.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09989192243665457, "epoch": 1.216, "frac_reward_zero_std": 0.0, "grad_norm": 0.08182590454816818, "learning_rate": 1.7555968780706094e-06, "loss": -0.0097, "num_tokens": 5282179.0, "reward": 11.646211624145508, "reward_std": 4.344832897186279, "rewards/fitness_reward/mean": 6.744922161102295, "rewards/fitness_reward/std": 2.4249672889709473, "rewards/kidney_reward/mean": 2.2594614028930664, "rewards/kidney_reward/std": 1.0196256637573242, "rewards/length2tails_reward/mean": 0.5947597026824951, "rewards/length2tails_reward/std": 0.3878384232521057, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 2.4886021614074707, "rewards/thermo_reward/std": 1.9756776094436646, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.375, "completions/mean_terminated_length": 270.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09534382726997137, "epoch": 1.218, "frac_reward_zero_std": 0.0, "grad_norm": 0.0723676010966301, "learning_rate": 1.7547563145940156e-06, "loss": -0.0072, "num_tokens": 5290863.0, "reward": 12.647649765014648, "reward_std": 3.0074362754821777, "rewards/fitness_reward/mean": 6.963343620300293, "rewards/fitness_reward/std": 1.9421277046203613, "rewards/kidney_reward/mean": 2.508197546005249, "rewards/kidney_reward/std": 0.5323163270950317, "rewards/length2tails_reward/mean": 0.6217700839042664, "rewards/length2tails_reward/std": 0.345920205116272, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0139319896698, "rewards/thermo_reward/std": 1.527239441871643, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.53125, "completions/mean_terminated_length": 270.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10193273890763521, "epoch": 1.22, "frac_reward_zero_std": 0.0, "grad_norm": 0.05713942274451256, "learning_rate": 1.7539145101103042e-06, "loss": -0.0058, "num_tokens": 5299552.0, "reward": 12.285564422607422, "reward_std": 3.7375283241271973, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.4296531677246094, "rewards/kidney_reward/std": 0.5578335523605347, "rewards/length2tails_reward/mean": 0.6197089552879333, "rewards/length2tails_reward/std": 0.34006068110466003, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6408867835998535, "rewards/thermo_reward/std": 2.2673563957214355, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 273.28125, "completions/mean_terminated_length": 273.28125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10183690022677183, "epoch": 1.222, "frac_reward_zero_std": 0.0, "grad_norm": 0.061640795320272446, "learning_rate": 1.753071466003611e-06, "loss": -0.0057, "num_tokens": 5308329.0, "reward": 12.557572364807129, "reward_std": 3.8539183139801025, "rewards/fitness_reward/mean": 7.030216693878174, "rewards/fitness_reward/std": 1.8722407817840576, "rewards/kidney_reward/mean": 2.438776969909668, "rewards/kidney_reward/std": 0.6374463438987732, "rewards/length2tails_reward/mean": 0.7697838544845581, "rewards/length2tails_reward/std": 0.32206451892852783, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9116005897521973, "rewards/thermo_reward/std": 1.5846034288406372, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10672392975538969, "epoch": 1.224, "frac_reward_zero_std": 0.0, "grad_norm": 0.06281083822250366, "learning_rate": 1.75222718366011e-06, "loss": -0.004, "num_tokens": 5317025.0, "reward": 11.681337356567383, "reward_std": 4.935064792633057, "rewards/fitness_reward/mean": 6.867335319519043, "rewards/fitness_reward/std": 2.219416379928589, "rewards/kidney_reward/mean": 2.1714673042297363, "rewards/kidney_reward/std": 1.151047945022583, "rewards/length2tails_reward/mean": 0.6821848154067993, "rewards/length2tails_reward/std": 0.29430484771728516, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4743170738220215, "rewards/thermo_reward/std": 2.287351608276367, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 271.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11091689392924309, "epoch": 1.226, "frac_reward_zero_std": 0.0, "grad_norm": 0.08150680363178253, "learning_rate": 1.7513816644680124e-06, "loss": -0.0003, "num_tokens": 5325748.0, "reward": 12.91486930847168, "reward_std": 2.003068208694458, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.517043113708496, "rewards/kidney_reward/std": 0.2941751182079315, "rewards/length2tails_reward/mean": 0.7082419395446777, "rewards/length2tails_reward/std": 0.33722445368766785, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.980835437774658, "rewards/thermo_reward/std": 1.5609627962112427, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10319388285279274, "epoch": 1.228, "frac_reward_zero_std": 0.0, "grad_norm": 0.15667755901813507, "learning_rate": 1.750534909817561e-06, "loss": 0.0037, "num_tokens": 5334500.0, "reward": 12.450875282287598, "reward_std": 4.654170989990234, "rewards/fitness_reward/mean": 7.026939392089844, "rewards/fitness_reward/std": 1.8907779455184937, "rewards/kidney_reward/mean": 2.3262205123901367, "rewards/kidney_reward/std": 1.1446887254714966, "rewards/length2tails_reward/mean": 0.7560428380966187, "rewards/length2tails_reward/std": 0.32128700613975525, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9221115112304688, "rewards/thermo_reward/std": 2.0271341800689697, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.71875, "completions/mean_terminated_length": 273.71875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10905345156788826, "epoch": 1.23, "frac_reward_zero_std": 0.0, "grad_norm": 0.10238877683877945, "learning_rate": 1.7496869211010314e-06, "loss": -0.0055, "num_tokens": 5343291.0, "reward": 12.982217788696289, "reward_std": 2.0052247047424316, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.517043113708496, "rewards/kidney_reward/std": 0.2941751182079315, "rewards/length2tails_reward/mean": 0.8131692409515381, "rewards/length2tails_reward/std": 0.2821599245071411, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.922672986984253, "rewards/thermo_reward/std": 1.738611102104187, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.4375, "completions/mean_terminated_length": 271.4375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10712381731718779, "epoch": 1.232, "frac_reward_zero_std": 0.0, "grad_norm": 0.07008599489927292, "learning_rate": 1.7488376997127282e-06, "loss": -0.0033, "num_tokens": 5352009.0, "reward": 13.434881210327148, "reward_std": 0.9187362194061279, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.6913747787475586, "rewards/length2tails_reward/std": 0.3248680531978607, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3903069496154785, "rewards/thermo_reward/std": 0.6159141063690186, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 267.96875, "completions/mean_terminated_length": 267.96875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.11229937896132469, "epoch": 1.234, "frac_reward_zero_std": 0.0, "grad_norm": 0.48756271600723267, "learning_rate": 1.7479872470489823e-06, "loss": -0.058, "num_tokens": 5360616.0, "reward": 11.352066040039062, "reward_std": 6.411108493804932, "rewards/fitness_reward/mean": 6.309316635131836, "rewards/fitness_reward/std": 3.334545373916626, "rewards/kidney_reward/mean": 2.0699820518493652, "rewards/kidney_reward/std": 1.5359529256820679, "rewards/length2tails_reward/mean": 0.7619932889938354, "rewards/length2tails_reward/std": 0.31214094161987305, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.796567678451538, "rewards/thermo_reward/std": 2.15085768699646, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09793747030198574, "epoch": 1.236, "frac_reward_zero_std": 0.0, "grad_norm": 0.074201799929142, "learning_rate": 1.7471355645081495e-06, "loss": -0.0065, "num_tokens": 5369371.0, "reward": 11.125179290771484, "reward_std": 6.282215118408203, "rewards/fitness_reward/mean": 6.593475341796875, "rewards/fitness_reward/std": 2.824373722076416, "rewards/kidney_reward/mean": 2.0429749488830566, "rewards/kidney_reward/std": 1.5781100988388062, "rewards/length2tails_reward/mean": 0.7332035303115845, "rewards/length2tails_reward/std": 0.3300551772117615, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.315408229827881, "rewards/thermo_reward/std": 2.452349901199341, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.09375, "completions/mean_terminated_length": 271.09375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10096073895692825, "epoch": 1.238, "frac_reward_zero_std": 0.0, "grad_norm": 0.08234567940235138, "learning_rate": 1.7462826534906078e-06, "loss": -0.0054, "num_tokens": 5378078.0, "reward": 12.692102432250977, "reward_std": 3.2898201942443848, "rewards/fitness_reward/mean": 7.0526275634765625, "rewards/fitness_reward/std": 1.7454651594161987, "rewards/kidney_reward/mean": 2.5099849700927734, "rewards/kidney_reward/std": 0.5226497054100037, "rewards/length2tails_reward/mean": 0.6364307403564453, "rewards/length2tails_reward/std": 0.3661468029022217, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9658467769622803, "rewards/thermo_reward/std": 1.6006118059158325, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.09375, "completions/mean_terminated_length": 272.09375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.12040111888200045, "epoch": 1.24, "frac_reward_zero_std": 0.0, "grad_norm": 0.11013831198215485, "learning_rate": 1.7454285153987552e-06, "loss": -0.0011, "num_tokens": 5386817.0, "reward": 13.46084976196289, "reward_std": 0.7325454950332642, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.774829089641571, "rewards/length2tails_reward/std": 0.24128678441047668, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3504209518432617, "rewards/thermo_reward/std": 0.6277978420257568, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.34375, "completions/mean_terminated_length": 271.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10458789858967066, "epoch": 1.242, "frac_reward_zero_std": 0.0, "grad_norm": 0.08587851375341415, "learning_rate": 1.744573151637007e-06, "loss": -0.0077, "num_tokens": 5395532.0, "reward": 12.471405982971191, "reward_std": 3.1049158573150635, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.452592670917511, "rewards/kidney_reward/mean": 2.276317596435547, "rewards/kidney_reward/std": 1.0352544784545898, "rewards/length2tails_reward/mean": 0.6744846105575562, "rewards/length2tails_reward/std": 0.343734472990036, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7814741134643555, "rewards/thermo_reward/std": 1.9983136653900146, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10727937240153551, "epoch": 1.244, "frac_reward_zero_std": 0.0, "grad_norm": 0.12728402018547058, "learning_rate": 1.7437165636117939e-06, "loss": -0.004, "num_tokens": 5404287.0, "reward": 13.306811332702637, "reward_std": 1.1299161911010742, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7561923265457153, "rewards/length2tails_reward/std": 0.28997352719306946, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2557549476623535, "rewards/thermo_reward/std": 0.9147730469703674, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.3125, "completions/mean_terminated_length": 271.3125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10531806293874979, "epoch": 1.246, "frac_reward_zero_std": 0.0, "grad_norm": 0.08190429955720901, "learning_rate": 1.7428587527315596e-06, "loss": -0.0059, "num_tokens": 5413001.0, "reward": 12.839922904968262, "reward_std": 2.6034655570983887, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.419279098510742, "rewards/kidney_reward/std": 0.7175997495651245, "rewards/length2tails_reward/mean": 0.6887946128845215, "rewards/length2tails_reward/std": 0.33503562211990356, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8905797004699707, "rewards/thermo_reward/std": 1.9347537755966187, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.65625, "completions/mean_terminated_length": 270.65625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09187165554612875, "epoch": 1.248, "frac_reward_zero_std": 0.0, "grad_norm": 0.08307106792926788, "learning_rate": 1.741999720406759e-06, "loss": -0.0078, "num_tokens": 5421694.0, "reward": 10.21883773803711, "reward_std": 7.325775623321533, "rewards/fitness_reward/mean": 5.902102470397949, "rewards/fitness_reward/std": 3.7705349922180176, "rewards/kidney_reward/mean": 1.8158752918243408, "rewards/kidney_reward/std": 1.697677493095398, "rewards/length2tails_reward/mean": 0.6075935363769531, "rewards/length2tails_reward/std": 0.38769808411598206, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.340100049972534, "rewards/thermo_reward/std": 2.136388063430786, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.34375, "completions/mean_terminated_length": 269.34375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0934913819655776, "epoch": 1.25, "frac_reward_zero_std": 0.0, "grad_norm": 0.09207896888256073, "learning_rate": 1.7411394680498548e-06, "loss": 0.0015, "num_tokens": 5430345.0, "reward": 12.443601608276367, "reward_std": 3.250117301940918, "rewards/fitness_reward/mean": 7.188657760620117, "rewards/fitness_reward/std": 0.7179933190345764, "rewards/kidney_reward/mean": 2.3776919841766357, "rewards/kidney_reward/std": 0.8947495222091675, "rewards/length2tails_reward/mean": 0.49976906180381775, "rewards/length2tails_reward/std": 0.38500484824180603, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7272748947143555, "rewards/thermo_reward/std": 1.8555277585983276, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 272.1875, "completions/mean_terminated_length": 272.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11000367347151041, "epoch": 1.252, "frac_reward_zero_std": 0.0, "grad_norm": 0.07254158705472946, "learning_rate": 1.7402779970753154e-06, "loss": -0.0092, "num_tokens": 5439087.0, "reward": 11.843017578125, "reward_std": 3.7989084720611572, "rewards/fitness_reward/mean": 6.99554443359375, "rewards/fitness_reward/std": 1.7628074884414673, "rewards/kidney_reward/mean": 2.329591751098633, "rewards/kidney_reward/std": 0.8785039782524109, "rewards/length2tails_reward/mean": 0.7139759659767151, "rewards/length2tails_reward/std": 0.33523863554000854, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.3464832305908203, "rewards/thermo_reward/std": 1.9240782260894775, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.09375, "completions/mean_terminated_length": 273.09375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09588021412491798, "epoch": 1.254, "frac_reward_zero_std": 0.0, "grad_norm": 0.0824722871184349, "learning_rate": 1.7394153088996139e-06, "loss": -0.0071, "num_tokens": 5447858.0, "reward": 12.915249824523926, "reward_std": 1.940028190612793, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5070910453796387, "rewards/kidney_reward/std": 0.5383089184761047, "rewards/length2tails_reward/mean": 0.8070980310440063, "rewards/length2tails_reward/std": 0.2656845450401306, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8662641048431396, "rewards/thermo_reward/std": 1.528942584991455, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.09873269777745008, "epoch": 1.256, "frac_reward_zero_std": 0.0, "grad_norm": 0.21686367690563202, "learning_rate": 1.738551404941224e-06, "loss": -0.0014, "num_tokens": 5456603.0, "reward": 11.136815071105957, "reward_std": 6.538121223449707, "rewards/fitness_reward/mean": 6.224935531616211, "rewards/fitness_reward/std": 3.4062118530273438, "rewards/kidney_reward/mean": 2.1010398864746094, "rewards/kidney_reward/std": 1.5924327373504639, "rewards/length2tails_reward/mean": 0.7687352299690247, "rewards/length2tails_reward/std": 0.28859299421310425, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6339666843414307, "rewards/thermo_reward/std": 2.106649160385132, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.6875, "completions/mean_terminated_length": 271.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09568661265075207, "epoch": 1.258, "frac_reward_zero_std": 0.0, "grad_norm": 0.10661531984806061, "learning_rate": 1.7376862866206186e-06, "loss": -0.0041, "num_tokens": 5465329.0, "reward": 11.562700271606445, "reward_std": 6.629768371582031, "rewards/fitness_reward/mean": 6.559837341308594, "rewards/fitness_reward/std": 2.9754812717437744, "rewards/kidney_reward/mean": 2.104600429534912, "rewards/kidney_reward/std": 1.686941385269165, "rewards/length2tails_reward/mean": 0.7176464796066284, "rewards/length2tails_reward/std": 0.3340187668800354, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7264976501464844, "rewards/thermo_reward/std": 2.2392425537109375, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 271.59375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09941136837005615, "epoch": 1.26, "frac_reward_zero_std": 0.0, "grad_norm": 0.06564158946275711, "learning_rate": 1.7368199553602674e-06, "loss": -0.0052, "num_tokens": 5474052.0, "reward": 12.620133399963379, "reward_std": 2.574862003326416, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.486088991165161, "rewards/kidney_reward/std": 0.6529962420463562, "rewards/length2tails_reward/mean": 0.7097842693328857, "rewards/length2tails_reward/std": 0.30757448077201843, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6593899726867676, "rewards/thermo_reward/std": 1.9267350435256958, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.10511992871761322, "epoch": 1.262, "frac_reward_zero_std": 0.0, "grad_norm": 0.0890965685248375, "learning_rate": 1.735952412584635e-06, "loss": -0.0028, "num_tokens": 5482780.0, "reward": 12.693354606628418, "reward_std": 2.413665533065796, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.483823776245117, "rewards/kidney_reward/std": 0.5328060388565063, "rewards/length2tails_reward/mean": 0.7528306245803833, "rewards/length2tails_reward/std": 0.249490424990654, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.673062324523926, "rewards/thermo_reward/std": 2.0427510738372803, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.3125, "completions/mean_terminated_length": 273.3125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10003827605396509, "epoch": 1.264, "frac_reward_zero_std": 0.0, "grad_norm": 0.10966615378856659, "learning_rate": 1.7350836597201765e-06, "loss": -0.0008, "num_tokens": 5491558.0, "reward": 12.770936965942383, "reward_std": 4.350057601928711, "rewards/fitness_reward/mean": 7.010485649108887, "rewards/fitness_reward/std": 1.9838539361953735, "rewards/kidney_reward/mean": 2.367945671081543, "rewards/kidney_reward/std": 1.016711950302124, "rewards/length2tails_reward/mean": 0.779668927192688, "rewards/length2tails_reward/std": 0.3014443516731262, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2145395278930664, "rewards/thermo_reward/std": 1.5500909090042114, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.9375, "completions/mean_terminated_length": 270.9375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10240908991545439, "epoch": 1.266, "frac_reward_zero_std": 0.0, "grad_norm": 0.0859563872218132, "learning_rate": 1.7342136981953387e-06, "loss": 0.0013, "num_tokens": 5500260.0, "reward": 12.530839920043945, "reward_std": 3.073016881942749, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3721253871917725, "rewards/kidney_reward/std": 0.9938655495643616, "rewards/length2tails_reward/mean": 0.6619828343391418, "rewards/length2tails_reward/std": 0.33176231384277344, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.688839912414551, "rewards/thermo_reward/std": 1.9960085153579712, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.28125, "completions/mean_terminated_length": 271.28125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10022217966616154, "epoch": 1.268, "frac_reward_zero_std": 0.0, "grad_norm": 0.14917409420013428, "learning_rate": 1.7333425294405545e-06, "loss": -0.0025, "num_tokens": 5508973.0, "reward": 12.465929985046387, "reward_std": 2.9090192317962646, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.239741802215576, "rewards/kidney_reward/std": 1.0994014739990234, "rewards/length2tails_reward/mean": 0.6749511361122131, "rewards/length2tails_reward/std": 0.3254489600658417, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6975083351135254, "rewards/thermo_reward/std": 2.029533624649048, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.46875, "completions/mean_terminated_length": 271.46875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10164427850395441, "epoch": 1.27, "frac_reward_zero_std": 0.0, "grad_norm": 0.07312794029712677, "learning_rate": 1.7324701548882418e-06, "loss": -0.0075, "num_tokens": 5517692.0, "reward": 13.093249320983887, "reward_std": 1.8741158246994019, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7035744190216064, "rewards/length2tails_reward/std": 0.3234020471572876, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.96258544921875, "rewards/thermo_reward/std": 1.7842705249786377, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09657697845250368, "epoch": 1.272, "frac_reward_zero_std": 0.0, "grad_norm": 0.08342888206243515, "learning_rate": 1.7315965759728013e-06, "loss": -0.0092, "num_tokens": 5526446.0, "reward": 12.716133117675781, "reward_std": 2.338914632797241, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.41390061378479, "rewards/kidney_reward/std": 0.6353449821472168, "rewards/length2tails_reward/mean": 0.7105174660682678, "rewards/length2tails_reward/std": 0.35961011052131653, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.76999568939209, "rewards/thermo_reward/std": 1.8458997011184692, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09856419172137976, "epoch": 1.274, "frac_reward_zero_std": 0.0, "grad_norm": 0.07856637984514236, "learning_rate": 1.7307217941306143e-06, "loss": -0.0037, "num_tokens": 5535191.0, "reward": 12.767711639404297, "reward_std": 3.3116235733032227, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.457012176513672, "rewards/kidney_reward/std": 0.5447914004325867, "rewards/length2tails_reward/mean": 0.8048985004425049, "rewards/length2tails_reward/std": 0.26381057500839233, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.077155590057373, "rewards/thermo_reward/std": 1.5934613943099976, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.6875, "completions/mean_terminated_length": 271.6875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09590794518589973, "epoch": 1.276, "frac_reward_zero_std": 0.0, "grad_norm": 0.10152795165777206, "learning_rate": 1.7298458108000397e-06, "loss": -0.0103, "num_tokens": 5543917.0, "reward": 12.167291641235352, "reward_std": 3.5369625091552734, "rewards/fitness_reward/mean": 7.015777587890625, "rewards/fitness_reward/std": 1.9539211988449097, "rewards/kidney_reward/mean": 2.3080854415893555, "rewards/kidney_reward/std": 0.8558576107025146, "rewards/length2tails_reward/mean": 0.7294836044311523, "rewards/length2tails_reward/std": 0.3052122890949249, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.670480728149414, "rewards/thermo_reward/std": 2.1437149047851562, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 268.21875, "completions/mean_terminated_length": 268.21875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.1038109278306365, "epoch": 1.278, "frac_reward_zero_std": 0.0, "grad_norm": 0.07300246506929398, "learning_rate": 1.7289686274214115e-06, "loss": -0.0044, "num_tokens": 5552532.0, "reward": 12.078481674194336, "reward_std": 4.850605010986328, "rewards/fitness_reward/mean": 7.006332874298096, "rewards/fitness_reward/std": 2.0073466300964355, "rewards/kidney_reward/mean": 2.214355945587158, "rewards/kidney_reward/std": 1.273207664489746, "rewards/length2tails_reward/mean": 0.6589057445526123, "rewards/length2tails_reward/std": 0.337287575006485, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6919021606445312, "rewards/thermo_reward/std": 2.128614664077759, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.65625, "completions/mean_terminated_length": 272.65625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11321705486625433, "epoch": 1.28, "frac_reward_zero_std": 0.0, "grad_norm": 0.08817020803689957, "learning_rate": 1.728090245437038e-06, "loss": -0.0026, "num_tokens": 5561289.0, "reward": 13.19006633758545, "reward_std": 1.383169174194336, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7889949083328247, "rewards/length2tails_reward/std": 0.24998170137405396, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1055798530578613, "rewards/thermo_reward/std": 1.1738027334213257, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.1875, "completions/mean_terminated_length": 272.1875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09994053188711405, "epoch": 1.282, "frac_reward_zero_std": 0.0, "grad_norm": 0.11618512123823166, "learning_rate": 1.7272106662911971e-06, "loss": -0.0028, "num_tokens": 5570031.0, "reward": 11.90559196472168, "reward_std": 3.7982654571533203, "rewards/fitness_reward/mean": 6.99554443359375, "rewards/fitness_reward/std": 1.7628074884414673, "rewards/kidney_reward/mean": 2.3164985179901123, "rewards/kidney_reward/std": 0.8215380907058716, "rewards/length2tails_reward/mean": 0.7287619113922119, "rewards/length2tails_reward/std": 0.32916101813316345, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4206724166870117, "rewards/thermo_reward/std": 2.0186150074005127, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.78125, "completions/mean_terminated_length": 270.78125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10221993364393711, "epoch": 1.284, "frac_reward_zero_std": 0.0, "grad_norm": 0.17188133299350739, "learning_rate": 1.7263298914301365e-06, "loss": 0.0036, "num_tokens": 5578728.0, "reward": 12.869743347167969, "reward_std": 2.4321837425231934, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.401759147644043, "rewards/kidney_reward/std": 0.7849282622337341, "rewards/length2tails_reward/mean": 0.6851349472999573, "rewards/length2tails_reward/std": 0.3111562430858612, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9382848739624023, "rewards/thermo_reward/std": 1.7535111904144287, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11419525649398565, "epoch": 1.286, "frac_reward_zero_std": 0.0, "grad_norm": 0.12147994339466095, "learning_rate": 1.7254479223020683e-06, "loss": 0.0008, "num_tokens": 5587456.0, "reward": 10.761012077331543, "reward_std": 6.3070526123046875, "rewards/fitness_reward/mean": 6.451765060424805, "rewards/fitness_reward/std": 2.736926317214966, "rewards/kidney_reward/mean": 1.9459079504013062, "rewards/kidney_reward/std": 1.570999026298523, "rewards/length2tails_reward/mean": 0.682193398475647, "rewards/length2tails_reward/std": 0.3580475449562073, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.1951191425323486, "rewards/thermo_reward/std": 2.5102174282073975, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10469977743923664, "epoch": 1.288, "frac_reward_zero_std": 0.0, "grad_norm": 0.10412470996379852, "learning_rate": 1.7245647603571701e-06, "loss": -0.0009, "num_tokens": 5596172.0, "reward": 11.901531219482422, "reward_std": 4.787817478179932, "rewards/fitness_reward/mean": 7.004397392272949, "rewards/fitness_reward/std": 2.018296003341675, "rewards/kidney_reward/mean": 2.2795729637145996, "rewards/kidney_reward/std": 1.190348744392395, "rewards/length2tails_reward/mean": 0.6881000399589539, "rewards/length2tails_reward/std": 0.3569899797439575, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4487509727478027, "rewards/thermo_reward/std": 2.2187716960906982, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10657103080302477, "epoch": 1.29, "frac_reward_zero_std": 0.0, "grad_norm": 0.06384288519620895, "learning_rate": 1.7236804070475786e-06, "loss": -0.0035, "num_tokens": 5604896.0, "reward": 11.755149841308594, "reward_std": 4.59630012512207, "rewards/fitness_reward/mean": 6.955345630645752, "rewards/fitness_reward/std": 1.986746907234192, "rewards/kidney_reward/mean": 2.236717700958252, "rewards/kidney_reward/std": 1.0580967664718628, "rewards/length2tails_reward/mean": 0.6883218884468079, "rewards/length2tails_reward/std": 0.3535401523113251, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.394253730773926, "rewards/thermo_reward/std": 2.3135814666748047, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.84375, "completions/mean_terminated_length": 272.84375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10480425134301186, "epoch": 1.292, "frac_reward_zero_std": 0.0, "grad_norm": 0.089121013879776, "learning_rate": 1.7227948638273915e-06, "loss": -0.0065, "num_tokens": 5613659.0, "reward": 12.493803024291992, "reward_std": 2.4889931678771973, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4569358825683594, "rewards/kidney_reward/std": 0.5451717972755432, "rewards/length2tails_reward/mean": 0.7576339244842529, "rewards/length2tails_reward/std": 0.31424081325531006, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.499917984008789, "rewards/thermo_reward/std": 2.142576217651367, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.9375, "completions/mean_terminated_length": 272.9375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.11304991412907839, "epoch": 1.294, "frac_reward_zero_std": 0.0, "grad_norm": 0.0759241059422493, "learning_rate": 1.7219081321526616e-06, "loss": -0.0063, "num_tokens": 5622425.0, "reward": 12.531675338745117, "reward_std": 4.246157646179199, "rewards/fitness_reward/mean": 7.038208484649658, "rewards/fitness_reward/std": 1.8270317316055298, "rewards/kidney_reward/mean": 2.4285740852355957, "rewards/kidney_reward/std": 0.9721502661705017, "rewards/length2tails_reward/mean": 0.7843444347381592, "rewards/length2tails_reward/std": 0.31813105940818787, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8864593505859375, "rewards/thermo_reward/std": 1.7416695356369019, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.1875, "completions/mean_terminated_length": 271.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10981705971062183, "epoch": 1.296, "frac_reward_zero_std": 0.0, "grad_norm": 0.09525270760059357, "learning_rate": 1.7210202134813968e-06, "loss": -0.004, "num_tokens": 5631135.0, "reward": 12.248756408691406, "reward_std": 4.042028903961182, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.452592670917511, "rewards/kidney_reward/mean": 2.1447324752807617, "rewards/kidney_reward/std": 1.3550604581832886, "rewards/length2tails_reward/mean": 0.6968757510185242, "rewards/length2tails_reward/std": 0.319642037153244, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6881699562072754, "rewards/thermo_reward/std": 2.5021729469299316, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.1875, "completions/mean_terminated_length": 271.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09238530695438385, "epoch": 1.298, "frac_reward_zero_std": 0.0, "grad_norm": 0.0611051470041275, "learning_rate": 1.7201311092735562e-06, "loss": -0.0065, "num_tokens": 5639845.0, "reward": 11.624662399291992, "reward_std": 5.298251152038574, "rewards/fitness_reward/mean": 6.383270740509033, "rewards/fitness_reward/std": 3.0991289615631104, "rewards/kidney_reward/mean": 2.249495506286621, "rewards/kidney_reward/std": 0.9490680694580078, "rewards/length2tails_reward/mean": 0.6396400928497314, "rewards/length2tails_reward/std": 0.3494209349155426, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8279314041137695, "rewards/thermo_reward/std": 1.9005476236343384, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.3125, "completions/mean_terminated_length": 273.3125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10085057467222214, "epoch": 1.3, "frac_reward_zero_std": 0.0, "grad_norm": 0.08008405566215515, "learning_rate": 1.719240820991048e-06, "loss": -0.0085, "num_tokens": 5648623.0, "reward": 12.24567699432373, "reward_std": 3.9568021297454834, "rewards/fitness_reward/mean": 6.99554443359375, "rewards/fitness_reward/std": 1.7628074884414673, "rewards/kidney_reward/mean": 2.364567279815674, "rewards/kidney_reward/std": 0.8396555185317993, "rewards/length2tails_reward/mean": 0.7675321102142334, "rewards/length2tails_reward/std": 0.32164403796195984, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7088122367858887, "rewards/thermo_reward/std": 2.098717451095581, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10491770878434181, "epoch": 1.302, "frac_reward_zero_std": 0.0, "grad_norm": 0.09322385489940643, "learning_rate": 1.7183493500977275e-06, "loss": -0.0081, "num_tokens": 5657381.0, "reward": 11.527852058410645, "reward_std": 4.960941791534424, "rewards/fitness_reward/mean": 6.880526065826416, "rewards/fitness_reward/std": 1.8558024168014526, "rewards/kidney_reward/mean": 2.10536527633667, "rewards/kidney_reward/std": 1.4573596715927124, "rewards/length2tails_reward/mean": 0.7661879062652588, "rewards/length2tails_reward/std": 0.31442517042160034, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.365342378616333, "rewards/thermo_reward/std": 2.4136619567871094, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11029501724988222, "epoch": 1.304, "frac_reward_zero_std": 0.0, "grad_norm": 0.07199763506650925, "learning_rate": 1.717456698059395e-06, "loss": -0.0028, "num_tokens": 5666102.0, "reward": 12.992130279541016, "reward_std": 2.2417500019073486, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.452592670917511, "rewards/kidney_reward/mean": 2.4329657554626465, "rewards/kidney_reward/std": 0.7647271752357483, "rewards/length2tails_reward/mean": 0.6814402937889099, "rewards/length2tails_reward/std": 0.33173587918281555, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.144853115081787, "rewards/thermo_reward/std": 1.3190016746520996, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09424859192222357, "epoch": 1.306, "frac_reward_zero_std": 0.0, "grad_norm": 0.05711875110864639, "learning_rate": 1.716562866343792e-06, "loss": -0.0071, "num_tokens": 5674831.0, "reward": 12.767280578613281, "reward_std": 2.380852222442627, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.501613140106201, "rewards/kidney_reward/std": 0.5680586695671082, "rewards/length2tails_reward/mean": 0.6926798820495605, "rewards/length2tails_reward/std": 0.33684486150741577, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.735213279724121, "rewards/thermo_reward/std": 1.9477479457855225, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.1875, "completions/mean_terminated_length": 273.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10056958720088005, "epoch": 1.308, "frac_reward_zero_std": 0.0, "grad_norm": 0.07294055819511414, "learning_rate": 1.7156678564206008e-06, "loss": 0.0007, "num_tokens": 5683605.0, "reward": 12.314053535461426, "reward_std": 4.731460094451904, "rewards/fitness_reward/mean": 7.004344463348389, "rewards/fitness_reward/std": 2.0185959339141846, "rewards/kidney_reward/mean": 2.2853360176086426, "rewards/kidney_reward/std": 1.1564087867736816, "rewards/length2tails_reward/mean": 0.8089523315429688, "rewards/length2tails_reward/std": 0.25128456950187683, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.843478202819824, "rewards/thermo_reward/std": 2.112743854522705, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1040391530841589, "epoch": 1.31, "frac_reward_zero_std": 0.0, "grad_norm": 0.15061677992343903, "learning_rate": 1.7147716697614398e-06, "loss": -0.0033, "num_tokens": 5692334.0, "reward": 12.341215133666992, "reward_std": 3.8126730918884277, "rewards/fitness_reward/mean": 6.586376190185547, "rewards/fitness_reward/std": 2.8321077823638916, "rewards/kidney_reward/mean": 2.4342212677001953, "rewards/kidney_reward/std": 0.6610793471336365, "rewards/length2tails_reward/mean": 0.7200385332107544, "rewards/length2tails_reward/std": 0.31445440649986267, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.148613929748535, "rewards/thermo_reward/std": 1.3085203170776367, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10377451218664646, "epoch": 1.312, "frac_reward_zero_std": 0.0, "grad_norm": 0.14977595210075378, "learning_rate": 1.713874307839863e-06, "loss": 0.0007, "num_tokens": 5701072.0, "reward": 12.491024017333984, "reward_std": 2.8138327598571777, "rewards/fitness_reward/mean": 7.188657283782959, "rewards/fitness_reward/std": 0.7179933190345764, "rewards/kidney_reward/mean": 2.4353067874908447, "rewards/kidney_reward/std": 0.6554340720176697, "rewards/length2tails_reward/mean": 0.7257574796676636, "rewards/length2tails_reward/std": 0.3124699890613556, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.694483757019043, "rewards/thermo_reward/std": 2.06559681892395, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10568260494619608, "epoch": 1.314, "frac_reward_zero_std": 0.0, "grad_norm": 0.05766943842172623, "learning_rate": 1.7129757721313568e-06, "loss": -0.0062, "num_tokens": 5709788.0, "reward": 12.49569320678711, "reward_std": 3.365452766418457, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.3099136352539062, "rewards/kidney_reward/std": 0.7445457577705383, "rewards/length2tails_reward/mean": 0.7206904888153076, "rewards/length2tails_reward/std": 0.2951766550540924, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9606573581695557, "rewards/thermo_reward/std": 1.4621469974517822, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10371254477649927, "epoch": 1.316, "frac_reward_zero_std": 0.0, "grad_norm": 0.0695052519440651, "learning_rate": 1.7120760641133367e-06, "loss": -0.0026, "num_tokens": 5718523.0, "reward": 12.284019470214844, "reward_std": 3.2596521377563477, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.452592670917511, "rewards/kidney_reward/mean": 2.1595919132232666, "rewards/kidney_reward/std": 1.2625459432601929, "rewards/length2tails_reward/mean": 0.7355765700340271, "rewards/length2tails_reward/std": 0.3051132559776306, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7047033309936523, "rewards/thermo_reward/std": 1.9182021617889404, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0999411167576909, "epoch": 1.318, "frac_reward_zero_std": 0.0, "grad_norm": 0.10656177997589111, "learning_rate": 1.7111751852651473e-06, "loss": -0.0069, "num_tokens": 5727254.0, "reward": 11.863635063171387, "reward_std": 5.591757297515869, "rewards/fitness_reward/mean": 6.636547088623047, "rewards/fitness_reward/std": 2.6368114948272705, "rewards/kidney_reward/mean": 2.2023534774780273, "rewards/kidney_reward/std": 1.3406885862350464, "rewards/length2tails_reward/mean": 0.6827249526977539, "rewards/length2tails_reward/std": 0.36245542764663696, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.856461763381958, "rewards/thermo_reward/std": 1.8226885795593262, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.5625, "completions/mean_terminated_length": 271.5625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10554569493979216, "epoch": 1.32, "frac_reward_zero_std": 0.0, "grad_norm": 0.11007196456193924, "learning_rate": 1.710273137068057e-06, "loss": -0.0056, "num_tokens": 5735976.0, "reward": 13.34743881225586, "reward_std": 1.5250986814498901, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.6940564513206482, "rewards/length2tails_reward/std": 0.3022388815879822, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2724452018737793, "rewards/thermo_reward/std": 1.2907878160476685, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.90625, "completions/mean_terminated_length": 270.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10180851072072983, "epoch": 1.322, "frac_reward_zero_std": 0.0, "grad_norm": 0.08281734585762024, "learning_rate": 1.7093699210052577e-06, "loss": -0.0059, "num_tokens": 5744677.0, "reward": 12.452281951904297, "reward_std": 3.473741292953491, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.3302063941955566, "rewards/kidney_reward/std": 0.7663140892982483, "rewards/length2tails_reward/mean": 0.6239981651306152, "rewards/length2tails_reward/std": 0.3571479320526123, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.906623125076294, "rewards/thermo_reward/std": 1.6070977449417114, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.8125, "completions/mean_terminated_length": 272.8125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10252044629305601, "epoch": 1.324, "frac_reward_zero_std": 0.0, "grad_norm": 0.06473301351070404, "learning_rate": 1.708465538561861e-06, "loss": -0.0021, "num_tokens": 5753439.0, "reward": 12.452521324157715, "reward_std": 4.619300842285156, "rewards/fitness_reward/mean": 7.002726078033447, "rewards/fitness_reward/std": 2.0277509689331055, "rewards/kidney_reward/mean": 2.3650083541870117, "rewards/kidney_reward/std": 1.032787799835205, "rewards/length2tails_reward/mean": 0.7824528813362122, "rewards/length2tails_reward/std": 0.28417396545410156, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.906541347503662, "rewards/thermo_reward/std": 2.026707887649536, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09698578109964728, "epoch": 1.326, "frac_reward_zero_std": 0.0, "grad_norm": 0.09350641071796417, "learning_rate": 1.707559991224897e-06, "loss": -0.0027, "num_tokens": 5762171.0, "reward": 11.34994125366211, "reward_std": 5.410051345825195, "rewards/fitness_reward/mean": 6.9835734367370605, "rewards/fitness_reward/std": 2.136094808578491, "rewards/kidney_reward/mean": 2.084345579147339, "rewards/kidney_reward/std": 1.4731652736663818, "rewards/length2tails_reward/mean": 0.7022289633750916, "rewards/length2tails_reward/std": 0.34806326031684875, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.1117992401123047, "rewards/thermo_reward/std": 2.696627140045166, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09784791991114616, "epoch": 1.328, "frac_reward_zero_std": 0.0, "grad_norm": 0.07207152247428894, "learning_rate": 1.70665328048331e-06, "loss": -0.0049, "num_tokens": 5770895.0, "reward": 12.333126068115234, "reward_std": 2.750267267227173, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.3594183921813965, "rewards/kidney_reward/std": 0.8543692231178284, "rewards/length2tails_reward/mean": 0.6763886213302612, "rewards/length2tails_reward/std": 0.33825093507766724, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.444882869720459, "rewards/thermo_reward/std": 2.0760297775268555, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.34375, "completions/mean_terminated_length": 273.34375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10427253041416407, "epoch": 1.33, "frac_reward_zero_std": 0.0, "grad_norm": 0.13702253997325897, "learning_rate": 1.705745407827959e-06, "loss": 0.0057, "num_tokens": 5779674.0, "reward": 11.255656242370605, "reward_std": 6.599266529083252, "rewards/fitness_reward/mean": 6.487942695617676, "rewards/fitness_reward/std": 2.807030439376831, "rewards/kidney_reward/mean": 1.972966194152832, "rewards/kidney_reward/std": 1.6536844968795776, "rewards/length2tails_reward/mean": 0.7791774272918701, "rewards/length2tails_reward/std": 0.29168036580085754, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6168298721313477, "rewards/thermo_reward/std": 2.366076946258545, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10135769378393888, "epoch": 1.332, "frac_reward_zero_std": 0.0, "grad_norm": 0.0624493770301342, "learning_rate": 1.7048363747516117e-06, "loss": -0.0052, "num_tokens": 5788390.0, "reward": 12.884943008422852, "reward_std": 2.232584238052368, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4621386528015137, "rewards/kidney_reward/std": 0.6469355821609497, "rewards/length2tails_reward/mean": 0.6847348213195801, "rewards/length2tails_reward/std": 0.33357691764831543, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.89314603805542, "rewards/thermo_reward/std": 1.7530150413513184, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10582635086029768, "epoch": 1.334, "frac_reward_zero_std": 0.0, "grad_norm": 0.11112461239099503, "learning_rate": 1.7039261827489448e-06, "loss": -0.0051, "num_tokens": 5797146.0, "reward": 10.896465301513672, "reward_std": 7.279848575592041, "rewards/fitness_reward/mean": 6.5933732986450195, "rewards/fitness_reward/std": 3.0246944427490234, "rewards/kidney_reward/mean": 2.111262798309326, "rewards/kidney_reward/std": 1.919937252998352, "rewards/length2tails_reward/mean": 0.6648454070091248, "rewards/length2tails_reward/std": 0.3326110243797302, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.025343894958496, "rewards/thermo_reward/std": 2.9303271770477295, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10798757802695036, "epoch": 1.336, "frac_reward_zero_std": 0.0, "grad_norm": 0.10530514270067215, "learning_rate": 1.7030148333165406e-06, "loss": -0.0034, "num_tokens": 5805894.0, "reward": 13.327153205871582, "reward_std": 1.369816541671753, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.748895525932312, "rewards/length2tails_reward/std": 0.31100350618362427, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1645989418029785, "rewards/thermo_reward/std": 1.3630008697509766, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.90625, "completions/mean_terminated_length": 271.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10481843631714582, "epoch": 1.338, "frac_reward_zero_std": 0.0, "grad_norm": 0.11892516911029816, "learning_rate": 1.702102327952884e-06, "loss": 0.0018, "num_tokens": 5814627.0, "reward": 12.889093399047852, "reward_std": 2.3293163776397705, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.46122407913208, "rewards/kidney_reward/std": 0.6518173217773438, "rewards/length2tails_reward/mean": 0.7261526584625244, "rewards/length2tails_reward/std": 0.31381848454475403, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9515786170959473, "rewards/thermo_reward/std": 1.6045955419540405, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.8125, "completions/mean_terminated_length": 273.8125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.1091268165037036, "epoch": 1.34, "frac_reward_zero_std": 0.0, "grad_norm": 0.1734755039215088, "learning_rate": 1.7011886681583607e-06, "loss": -0.0011, "num_tokens": 5823421.0, "reward": 12.640786170959473, "reward_std": 3.061244010925293, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.376332998275757, "rewards/kidney_reward/std": 0.884257435798645, "rewards/length2tails_reward/mean": 0.7936819791793823, "rewards/length2tails_reward/std": 0.2865562438964844, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8389179706573486, "rewards/thermo_reward/std": 1.8952285051345825, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.9375, "completions/mean_terminated_length": 271.9375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10370411397889256, "epoch": 1.342, "frac_reward_zero_std": 0.0, "grad_norm": 0.11059630662202835, "learning_rate": 1.7002738554352548e-06, "loss": -0.0007, "num_tokens": 5832155.0, "reward": 11.552849769592285, "reward_std": 5.098392963409424, "rewards/fitness_reward/mean": 6.611078262329102, "rewards/fitness_reward/std": 2.732717752456665, "rewards/kidney_reward/mean": 2.1995174884796143, "rewards/kidney_reward/std": 1.1577376127243042, "rewards/length2tails_reward/mean": 0.724226176738739, "rewards/length2tails_reward/std": 0.31711849570274353, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.569831371307373, "rewards/thermo_reward/std": 2.242499589920044, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10955226328223944, "epoch": 1.3439999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.07638181000947952, "learning_rate": 1.6993578912877462e-06, "loss": -0.0004, "num_tokens": 5840920.0, "reward": 12.043489456176758, "reward_std": 4.820775032043457, "rewards/fitness_reward/mean": 6.899222373962402, "rewards/fitness_reward/std": 1.9935646057128906, "rewards/kidney_reward/mean": 2.306669235229492, "rewards/kidney_reward/std": 1.1659220457077026, "rewards/length2tails_reward/mean": 0.8214871287345886, "rewards/length2tails_reward/std": 0.22858977317810059, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6554486751556396, "rewards/thermo_reward/std": 2.1794092655181885, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 270.0, "completions/mean_terminated_length": 270.0, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09130106586962938, "epoch": 1.346, "frac_reward_zero_std": 0.0, "grad_norm": 0.1298259198665619, "learning_rate": 1.6984407772219073e-06, "loss": -0.0047, "num_tokens": 5849592.0, "reward": 12.563082695007324, "reward_std": 3.830124616622925, "rewards/fitness_reward/mean": 6.621788024902344, "rewards/fitness_reward/std": 2.7010300159454346, "rewards/kidney_reward/mean": 2.4843716621398926, "rewards/kidney_reward/std": 0.5299732089042664, "rewards/length2tails_reward/mean": 0.6012275218963623, "rewards/length2tails_reward/std": 0.35651201009750366, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2968006134033203, "rewards/thermo_reward/std": 0.906000018119812, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10582957789301872, "epoch": 1.3479999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.23371417820453644, "learning_rate": 1.6975225147457024e-06, "loss": 0.0039, "num_tokens": 5858352.0, "reward": 12.074682235717773, "reward_std": 5.514390468597412, "rewards/fitness_reward/mean": 6.95579719543457, "rewards/fitness_reward/std": 2.2932186126708984, "rewards/kidney_reward/mean": 2.3026914596557617, "rewards/kidney_reward/std": 1.376892328262329, "rewards/length2tails_reward/mean": 0.7860455513000488, "rewards/length2tails_reward/std": 0.2741868793964386, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6375892162323, "rewards/thermo_reward/std": 2.3904688358306885, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 270.4375, "completions/mean_terminated_length": 270.4375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09917244967073202, "epoch": 1.35, "frac_reward_zero_std": 0.0, "grad_norm": 0.08062262088060379, "learning_rate": 1.6966031053689827e-06, "loss": -0.0034, "num_tokens": 5867038.0, "reward": 13.281015396118164, "reward_std": 1.455064058303833, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.650111734867096, "rewards/length2tails_reward/std": 0.3185828924179077, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.155698299407959, "rewards/thermo_reward/std": 1.426273226737976, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11477402225136757, "epoch": 1.3519999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.06305685639381409, "learning_rate": 1.6956825506034863e-06, "loss": -0.0056, "num_tokens": 5875818.0, "reward": 12.572243690490723, "reward_std": 2.5010037422180176, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.3857381343841553, "rewards/kidney_reward/std": 0.7801180481910706, "rewards/length2tails_reward/mean": 0.7871044874191284, "rewards/length2tails_reward/std": 0.30373579263687134, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6466102600097656, "rewards/thermo_reward/std": 1.8559621572494507, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10278540477156639, "epoch": 1.354, "frac_reward_zero_std": 0.0, "grad_norm": 0.08388179540634155, "learning_rate": 1.6947608519628342e-06, "loss": -0.0005, "num_tokens": 5884565.0, "reward": 13.578239440917969, "reward_std": 0.7047902345657349, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7521419525146484, "rewards/length2tails_reward/std": 0.29483869671821594, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.470078945159912, "rewards/thermo_reward/std": 0.5830413699150085, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 274.15625, "completions/mean_terminated_length": 274.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1032821387052536, "epoch": 1.3559999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.10173728317022324, "learning_rate": 1.693838010962528e-06, "loss": -0.0028, "num_tokens": 5893370.0, "reward": 12.261163711547852, "reward_std": 4.252288341522217, "rewards/fitness_reward/mean": 6.9797563552856445, "rewards/fitness_reward/std": 1.8506603240966797, "rewards/kidney_reward/mean": 2.328238010406494, "rewards/kidney_reward/std": 0.9794105291366577, "rewards/length2tails_reward/mean": 0.8271920680999756, "rewards/length2tails_reward/std": 0.26984670758247375, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.770451545715332, "rewards/thermo_reward/std": 1.8565285205841064, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.71875, "completions/mean_terminated_length": 272.71875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10730791371315718, "epoch": 1.358, "frac_reward_zero_std": 0.0, "grad_norm": 0.07551854848861694, "learning_rate": 1.6929140291199482e-06, "loss": -0.0015, "num_tokens": 5902129.0, "reward": 12.194419860839844, "reward_std": 4.578545570373535, "rewards/fitness_reward/mean": 6.999178886413574, "rewards/fitness_reward/std": 2.0478174686431885, "rewards/kidney_reward/mean": 2.3207554817199707, "rewards/kidney_reward/std": 0.9847736358642578, "rewards/length2tails_reward/mean": 0.747252881526947, "rewards/length2tails_reward/std": 0.31738021969795227, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6997597217559814, "rewards/thermo_reward/std": 2.0591087341308594, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10397808719426394, "epoch": 1.3599999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.12071414291858673, "learning_rate": 1.6919889079543505e-06, "loss": 0.002, "num_tokens": 5910880.0, "reward": 11.56614875793457, "reward_std": 6.464588642120361, "rewards/fitness_reward/mean": 6.585282802581787, "rewards/fitness_reward/std": 2.8306221961975098, "rewards/kidney_reward/mean": 2.123476505279541, "rewards/kidney_reward/std": 1.6230067014694214, "rewards/length2tails_reward/mean": 0.752088189125061, "rewards/length2tails_reward/std": 0.2834548056125641, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6821799278259277, "rewards/thermo_reward/std": 2.190772533416748, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.8125, "completions/mean_terminated_length": 271.8125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.11202096100896597, "epoch": 1.362, "frac_reward_zero_std": 0.0, "grad_norm": 0.2075084149837494, "learning_rate": 1.6910626489868648e-06, "loss": 0.0027, "num_tokens": 5919610.0, "reward": 12.229260444641113, "reward_std": 4.530606269836426, "rewards/fitness_reward/mean": 6.927616119384766, "rewards/fitness_reward/std": 1.896013855934143, "rewards/kidney_reward/mean": 2.3120498657226562, "rewards/kidney_reward/std": 1.084660291671753, "rewards/length2tails_reward/mean": 0.7117540836334229, "rewards/length2tails_reward/std": 0.3070603907108307, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8184189796447754, "rewards/thermo_reward/std": 1.822995901107788, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 273.6875, "completions/mean_terminated_length": 273.6875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.12129755318164825, "epoch": 1.3639999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.1131335198879242, "learning_rate": 1.69013525374049e-06, "loss": -0.0035, "num_tokens": 5928400.0, "reward": 12.55710220336914, "reward_std": 2.2411751747131348, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4991579055786133, "rewards/kidney_reward/std": 0.581434965133667, "rewards/length2tails_reward/mean": 0.7221885919570923, "rewards/length2tails_reward/std": 0.29065218567848206, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.5245399475097656, "rewards/thermo_reward/std": 1.9252128601074219, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10431716963648796, "epoch": 1.366, "frac_reward_zero_std": 0.0, "grad_norm": 0.13056468963623047, "learning_rate": 1.6892067237400957e-06, "loss": -0.0037, "num_tokens": 5937116.0, "reward": 12.706653594970703, "reward_std": 2.178776502609253, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.3893089294433594, "rewards/kidney_reward/std": 0.7356569170951843, "rewards/length2tails_reward/mean": 0.6970548629760742, "rewards/length2tails_reward/std": 0.3222173750400543, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.786454200744629, "rewards/thermo_reward/std": 1.6334350109100342, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.09375, "completions/mean_terminated_length": 273.09375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10924753546714783, "epoch": 1.3679999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.1930069625377655, "learning_rate": 1.688277060512416e-06, "loss": -0.0012, "num_tokens": 5945887.0, "reward": 12.911041259765625, "reward_std": 2.489048957824707, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.427684783935547, "rewards/kidney_reward/std": 0.6952542662620544, "rewards/length2tails_reward/mean": 0.7463508248329163, "rewards/length2tails_reward/std": 0.34181684255599976, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.005045175552368, "rewards/thermo_reward/std": 1.6862391233444214, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.21875, "completions/mean_terminated_length": 273.21875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10252848453819752, "epoch": 1.37, "frac_reward_zero_std": 0.0, "grad_norm": 0.10934145003557205, "learning_rate": 1.687346265586048e-06, "loss": -0.006, "num_tokens": 5954662.0, "reward": 12.280689239501953, "reward_std": 3.0210068225860596, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.1778016090393066, "rewards/kidney_reward/std": 0.9150532484054565, "rewards/length2tails_reward/mean": 0.7558243870735168, "rewards/length2tails_reward/std": 0.33266839385032654, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.623629093170166, "rewards/thermo_reward/std": 2.0062804222106934, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09957588836550713, "epoch": 1.3719999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.09618765115737915, "learning_rate": 1.6864143404914504e-06, "loss": -0.0034, "num_tokens": 5963410.0, "reward": 13.563940048217773, "reward_std": 0.6711642742156982, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7344149351119995, "rewards/length2tails_reward/std": 0.3277231454849243, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4301929473876953, "rewards/thermo_reward/std": 0.6010707020759583, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09993568249046803, "epoch": 1.374, "frac_reward_zero_std": 0.0, "grad_norm": 0.11227070540189743, "learning_rate": 1.6854812867609395e-06, "loss": -0.0058, "num_tokens": 5972158.0, "reward": 12.18494701385498, "reward_std": 3.8810713291168213, "rewards/fitness_reward/mean": 6.744510650634766, "rewards/fitness_reward/std": 2.426586151123047, "rewards/kidney_reward/mean": 2.402294158935547, "rewards/kidney_reward/std": 0.5692219734191895, "rewards/length2tails_reward/mean": 0.7263046503067017, "rewards/length2tails_reward/std": 0.34151479601860046, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8655123710632324, "rewards/thermo_reward/std": 1.7246884107589722, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10269459336996078, "epoch": 1.376, "frac_reward_zero_std": 0.0, "grad_norm": 0.07501363009214401, "learning_rate": 1.6845471059286886e-06, "loss": -0.0071, "num_tokens": 5980903.0, "reward": 12.991816520690918, "reward_std": 2.287548065185547, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.485689640045166, "rewards/kidney_reward/std": 0.6551914811134338, "rewards/length2tails_reward/mean": 0.7599628567695618, "rewards/length2tails_reward/std": 0.2903619408607483, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.026454448699951, "rewards/thermo_reward/std": 1.536690354347229, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.71875, "completions/mean_terminated_length": 273.71875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1052359938621521, "epoch": 1.3780000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.08972465991973877, "learning_rate": 1.6836117995307225e-06, "loss": -0.0019, "num_tokens": 5989694.0, "reward": 13.074825286865234, "reward_std": 1.664375901222229, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.517043113708496, "rewards/kidney_reward/std": 0.2941751182079315, "rewards/length2tails_reward/mean": 0.8253493309020996, "rewards/length2tails_reward/std": 0.24605530500411987, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0140621662139893, "rewards/thermo_reward/std": 1.5036872625350952, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10593629349023104, "epoch": 1.38, "frac_reward_zero_std": 0.0, "grad_norm": 0.1128479540348053, "learning_rate": 1.682675369104918e-06, "loss": -0.0049, "num_tokens": 5998458.0, "reward": 12.63668441772461, "reward_std": 2.3853824138641357, "rewards/fitness_reward/mean": 7.131148338317871, "rewards/fitness_reward/std": 0.7751544713973999, "rewards/kidney_reward/mean": 2.4066221714019775, "rewards/kidney_reward/std": 0.672130823135376, "rewards/length2tails_reward/mean": 0.7614470720291138, "rewards/length2tails_reward/std": 0.30095815658569336, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.922769546508789, "rewards/thermo_reward/std": 1.4726288318634033, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 270.8125, "completions/mean_terminated_length": 270.8125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1032063439488411, "epoch": 1.3820000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.09008792787790298, "learning_rate": 1.6817378161909995e-06, "loss": -0.0049, "num_tokens": 6007156.0, "reward": 12.718839645385742, "reward_std": 3.3231112957000732, "rewards/fitness_reward/mean": 6.999629974365234, "rewards/fitness_reward/std": 2.04526424407959, "rewards/kidney_reward/mean": 2.4188828468322754, "rewards/kidney_reward/std": 0.7191933393478394, "rewards/length2tails_reward/mean": 0.7066649198532104, "rewards/length2tails_reward/std": 0.30196088552474976, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.129659652709961, "rewards/thermo_reward/std": 1.5934593677520752, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10248612426221371, "epoch": 1.384, "frac_reward_zero_std": 0.0, "grad_norm": 0.05736776441335678, "learning_rate": 1.6807991423305372e-06, "loss": -0.0026, "num_tokens": 6015896.0, "reward": 13.128643035888672, "reward_std": 1.5915932655334473, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7266826629638672, "rewards/length2tails_reward/std": 0.3352401554584503, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.050387382507324, "rewards/thermo_reward/std": 1.3753472566604614, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.9375, "completions/mean_terminated_length": 272.9375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10259763896465302, "epoch": 1.3860000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.30000802874565125, "learning_rate": 1.6798593490669444e-06, "loss": 0.003, "num_tokens": 6024662.0, "reward": 12.33034896850586, "reward_std": 5.298914909362793, "rewards/fitness_reward/mean": 6.8621296882629395, "rewards/fitness_reward/std": 2.247586488723755, "rewards/kidney_reward/mean": 2.294316291809082, "rewards/kidney_reward/std": 1.4534074068069458, "rewards/length2tails_reward/mean": 0.7780168056488037, "rewards/length2tails_reward/std": 0.3289325535297394, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.996100902557373, "rewards/thermo_reward/std": 1.8078289031982422, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.65625, "completions/mean_terminated_length": 272.65625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.11150118336081505, "epoch": 1.388, "frac_reward_zero_std": 0.0, "grad_norm": 0.1403384655714035, "learning_rate": 1.678918437945475e-06, "loss": -0.0065, "num_tokens": 6033419.0, "reward": 12.41234016418457, "reward_std": 4.395359992980957, "rewards/fitness_reward/mean": 7.020390510559082, "rewards/fitness_reward/std": 1.9278241395950317, "rewards/kidney_reward/mean": 2.3808698654174805, "rewards/kidney_reward/std": 1.0912655591964722, "rewards/length2tails_reward/mean": 0.7625837922096252, "rewards/length2tails_reward/std": 0.2701271176338196, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8348212242126465, "rewards/thermo_reward/std": 1.7445740699768066, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.40625, "completions/mean_terminated_length": 272.40625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10476927366107702, "epoch": 1.3900000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.08929768949747086, "learning_rate": 1.677976410513221e-06, "loss": -0.002, "num_tokens": 6042168.0, "reward": 13.310680389404297, "reward_std": 1.8905342817306519, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5304651260375977, "rewards/kidney_reward/std": 0.5431409478187561, "rewards/length2tails_reward/mean": 0.774996280670166, "rewards/length2tails_reward/std": 0.2649703621864319, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.299039602279663, "rewards/thermo_reward/std": 1.1939067840576172, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1053413487970829, "epoch": 1.392, "frac_reward_zero_std": 0.0, "grad_norm": 0.12015575915575027, "learning_rate": 1.6770332683191095e-06, "loss": -0.0012, "num_tokens": 6050926.0, "reward": 11.121023178100586, "reward_std": 6.33602237701416, "rewards/fitness_reward/mean": 6.40739631652832, "rewards/fitness_reward/std": 3.015723943710327, "rewards/kidney_reward/mean": 1.9892921447753906, "rewards/kidney_reward/std": 1.7127926349639893, "rewards/length2tails_reward/mean": 0.750725269317627, "rewards/length2tails_reward/std": 0.3456234037876129, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.549262523651123, "rewards/thermo_reward/std": 2.3009464740753174, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.28125, "completions/mean_terminated_length": 271.28125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1106234323233366, "epoch": 1.3940000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.10471359640359879, "learning_rate": 1.6760890129139012e-06, "loss": -0.0004, "num_tokens": 6059639.0, "reward": 13.341649055480957, "reward_std": 1.8602107763290405, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.534843683242798, "rewards/kidney_reward/std": 0.5183712840080261, "rewards/length2tails_reward/mean": 0.6905333399772644, "rewards/length2tails_reward/std": 0.3257606327533722, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.276567220687866, "rewards/thermo_reward/std": 1.4345035552978516, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.21875, "completions/mean_terminated_length": 271.21875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10321180988103151, "epoch": 1.396, "frac_reward_zero_std": 0.0, "grad_norm": 0.08855379372835159, "learning_rate": 1.6751436458501868e-06, "loss": -0.006, "num_tokens": 6068350.0, "reward": 12.729233741760254, "reward_std": 2.860873222351074, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.401076316833496, "rewards/kidney_reward/std": 0.9794626235961914, "rewards/length2tails_reward/mean": 0.6787518262863159, "rewards/length2tails_reward/std": 0.3055427372455597, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7990972995758057, "rewards/thermo_reward/std": 2.0861966609954834, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.09375, "completions/mean_terminated_length": 273.09375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10413977596908808, "epoch": 1.3980000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.36088234186172485, "learning_rate": 1.6741971686823849e-06, "loss": -0.0063, "num_tokens": 6077121.0, "reward": 12.361455917358398, "reward_std": 4.51021146774292, "rewards/fitness_reward/mean": 6.9834794998168945, "rewards/fitness_reward/std": 1.8299298286437988, "rewards/kidney_reward/mean": 2.290621280670166, "rewards/kidney_reward/std": 1.148695945739746, "rewards/length2tails_reward/mean": 0.8251867294311523, "rewards/length2tails_reward/std": 0.23498550057411194, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.904836654663086, "rewards/thermo_reward/std": 1.8687138557434082, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.53125, "completions/mean_terminated_length": 273.53125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09921541810035706, "epoch": 1.4, "frac_reward_zero_std": 0.0, "grad_norm": 0.07918018102645874, "learning_rate": 1.6732495829667395e-06, "loss": -0.0057, "num_tokens": 6085906.0, "reward": 12.201481819152832, "reward_std": 4.22837495803833, "rewards/fitness_reward/mean": 6.993868827819824, "rewards/fitness_reward/std": 1.7721247673034668, "rewards/kidney_reward/mean": 2.2571630477905273, "rewards/kidney_reward/std": 1.0439294576644897, "rewards/length2tails_reward/mean": 0.8327365517616272, "rewards/length2tails_reward/std": 0.23388439416885376, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7671761512756348, "rewards/thermo_reward/std": 1.8673089742660522, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09286362864077091, "epoch": 1.4020000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.07303542643785477, "learning_rate": 1.6723008902613168e-06, "loss": -0.0066, "num_tokens": 6094622.0, "reward": 12.563545227050781, "reward_std": 3.932305335998535, "rewards/fitness_reward/mean": 6.680659294128418, "rewards/fitness_reward/std": 2.6905438899993896, "rewards/kidney_reward/mean": 2.4843716621398926, "rewards/kidney_reward/std": 0.5299732089042664, "rewards/length2tails_reward/mean": 0.6491783857345581, "rewards/length2tails_reward/std": 0.34321579337120056, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2335963249206543, "rewards/thermo_reward/std": 1.3015984296798706, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09672014974057674, "epoch": 1.404, "frac_reward_zero_std": 0.0, "grad_norm": 0.09479635953903198, "learning_rate": 1.6713510921260038e-06, "loss": -0.0057, "num_tokens": 6103366.0, "reward": 11.757293701171875, "reward_std": 4.412098407745361, "rewards/fitness_reward/mean": 6.99554443359375, "rewards/fitness_reward/std": 1.7628074884414673, "rewards/kidney_reward/mean": 2.1463756561279297, "rewards/kidney_reward/std": 1.3339651823043823, "rewards/length2tails_reward/mean": 0.7187809944152832, "rewards/length2tails_reward/std": 0.3444777727127075, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.443495035171509, "rewards/thermo_reward/std": 2.35288405418396, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.8125, "completions/mean_terminated_length": 271.8125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10314061678946018, "epoch": 1.4060000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.08774527162313461, "learning_rate": 1.670400190122505e-06, "loss": -0.0032, "num_tokens": 6112096.0, "reward": 12.55300521850586, "reward_std": 2.4551267623901367, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.442668914794922, "rewards/kidney_reward/std": 0.6173912882804871, "rewards/length2tails_reward/mean": 0.7243757843971252, "rewards/length2tails_reward/std": 0.2981627285480499, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.634222984313965, "rewards/thermo_reward/std": 1.8824540376663208, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.65625, "completions/mean_terminated_length": 270.65625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0946558965370059, "epoch": 1.408, "frac_reward_zero_std": 0.0, "grad_norm": 0.10838892310857773, "learning_rate": 1.66944818581434e-06, "loss": 0.0017, "num_tokens": 6120789.0, "reward": 12.890625, "reward_std": 2.3468823432922363, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4862608909606934, "rewards/kidney_reward/std": 0.6520509719848633, "rewards/length2tails_reward/mean": 0.6216571927070618, "rewards/length2tails_reward/std": 0.3502180874347687, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9385228157043457, "rewards/thermo_reward/std": 1.6677196025848389, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10168962553143501, "epoch": 1.41, "frac_reward_zero_std": 0.0, "grad_norm": 0.0808025449514389, "learning_rate": 1.6684950807668402e-06, "loss": -0.0036, "num_tokens": 6129505.0, "reward": 13.156676292419434, "reward_std": 1.6391278505325317, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.6781437397003174, "rewards/length2tails_reward/std": 0.31954437494277954, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0832748413085938, "rewards/thermo_reward/std": 1.4239745140075684, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.15625, "completions/mean_terminated_length": 271.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09764528181403875, "epoch": 1.412, "frac_reward_zero_std": 0.0, "grad_norm": 0.07843001186847687, "learning_rate": 1.6675408765471479e-06, "loss": -0.0046, "num_tokens": 6138214.0, "reward": 13.312134742736816, "reward_std": 1.5263793468475342, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.6829698085784912, "rewards/length2tails_reward/std": 0.3603144586086273, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2108922004699707, "rewards/thermo_reward/std": 1.381186842918396, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.4375, "completions/mean_terminated_length": 271.4375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10325187537819147, "epoch": 1.414, "frac_reward_zero_std": 0.0, "grad_norm": 0.12241241335868835, "learning_rate": 1.6665855747242117e-06, "loss": 0.0035, "num_tokens": 6146932.0, "reward": 11.462259292602539, "reward_std": 6.049312591552734, "rewards/fitness_reward/mean": 6.809179782867432, "rewards/fitness_reward/std": 2.2334835529327393, "rewards/kidney_reward/mean": 2.035914897918701, "rewards/kidney_reward/std": 1.6355434656143188, "rewards/length2tails_reward/mean": 0.6988885402679443, "rewards/length2tails_reward/std": 0.29368865489959717, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4472761154174805, "rewards/thermo_reward/std": 2.5866122245788574, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.34375, "completions/mean_terminated_length": 271.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0953157301992178, "epoch": 1.416, "frac_reward_zero_std": 0.0, "grad_norm": 0.06981104612350464, "learning_rate": 1.6656291768687855e-06, "loss": -0.007, "num_tokens": 6155647.0, "reward": 11.916261672973633, "reward_std": 3.774662971496582, "rewards/fitness_reward/mean": 6.99554443359375, "rewards/fitness_reward/std": 1.7628074884414673, "rewards/kidney_reward/mean": 2.2872872352600098, "rewards/kidney_reward/std": 0.833087682723999, "rewards/length2tails_reward/mean": 0.6991365551948547, "rewards/length2tails_reward/std": 0.3266415297985077, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4635162353515625, "rewards/thermo_reward/std": 1.992389440536499, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 267.15625, "completions/mean_terminated_length": 267.15625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.11413798667490482, "epoch": 1.418, "frac_reward_zero_std": 0.0, "grad_norm": 0.5822765231132507, "learning_rate": 1.6646716845534256e-06, "loss": -0.0898, "num_tokens": 6164228.0, "reward": 11.752373695373535, "reward_std": 6.4400410652160645, "rewards/fitness_reward/mean": 6.625027179718018, "rewards/fitness_reward/std": 2.897608995437622, "rewards/kidney_reward/mean": 2.1439552307128906, "rewards/kidney_reward/std": 1.5757523775100708, "rewards/length2tails_reward/mean": 0.8399752974510193, "rewards/length2tails_reward/std": 0.22871538996696472, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.799393653869629, "rewards/thermo_reward/std": 2.2180750370025635, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.15625, "completions/mean_terminated_length": 272.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11446765344589949, "epoch": 1.42, "frac_reward_zero_std": 0.0, "grad_norm": 0.1195162832736969, "learning_rate": 1.663713099352487e-06, "loss": -0.0033, "num_tokens": 6172969.0, "reward": 10.830946922302246, "reward_std": 6.950724124908447, "rewards/fitness_reward/mean": 5.919102668762207, "rewards/fitness_reward/std": 3.883455276489258, "rewards/kidney_reward/mean": 2.1333961486816406, "rewards/kidney_reward/std": 1.3962254524230957, "rewards/length2tails_reward/mean": 0.741581916809082, "rewards/length2tails_reward/std": 0.30954524874687195, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.60429048538208, "rewards/thermo_reward/std": 2.073528528213501, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10294345114380121, "epoch": 1.422, "frac_reward_zero_std": 0.0, "grad_norm": 0.09308013319969177, "learning_rate": 1.662753422842123e-06, "loss": 0.0011, "num_tokens": 6181741.0, "reward": 12.426822662353516, "reward_std": 4.511376857757568, "rewards/fitness_reward/mean": 7.002735137939453, "rewards/fitness_reward/std": 2.027700424194336, "rewards/kidney_reward/mean": 2.378523826599121, "rewards/kidney_reward/std": 0.9589655995368958, "rewards/length2tails_reward/mean": 0.8055219054222107, "rewards/length2tails_reward/std": 0.2933149039745331, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.865011692047119, "rewards/thermo_reward/std": 1.9666255712509155, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.625, "completions/mean_terminated_length": 270.625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0997420297935605, "epoch": 1.424, "frac_reward_zero_std": 0.0, "grad_norm": 0.08755825459957123, "learning_rate": 1.6617926566002795e-06, "loss": -0.0001, "num_tokens": 6190433.0, "reward": 11.679386138916016, "reward_std": 5.396921634674072, "rewards/fitness_reward/mean": 6.621771335601807, "rewards/fitness_reward/std": 2.6982581615448, "rewards/kidney_reward/mean": 2.2178728580474854, "rewards/kidney_reward/std": 1.2350701093673706, "rewards/length2tails_reward/mean": 0.6481842994689941, "rewards/length2tails_reward/std": 0.310602068901062, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6749234199523926, "rewards/thermo_reward/std": 2.1603336334228516, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 274.21875, "completions/mean_terminated_length": 274.21875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10170354135334492, "epoch": 1.426, "frac_reward_zero_std": 0.0, "grad_norm": 0.10588368028402328, "learning_rate": 1.660830802206696e-06, "loss": 0.005, "num_tokens": 6199240.0, "reward": 12.550549507141113, "reward_std": 3.580357313156128, "rewards/fitness_reward/mean": 7.188657283782959, "rewards/fitness_reward/std": 0.7179933190345764, "rewards/kidney_reward/mean": 2.2572848796844482, "rewards/kidney_reward/std": 1.0889778137207031, "rewards/length2tails_reward/mean": 0.7912466526031494, "rewards/length2tails_reward/std": 0.32440313696861267, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.925483226776123, "rewards/thermo_reward/std": 1.9545879364013672, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.8125, "completions/mean_terminated_length": 272.8125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10243859142065048, "epoch": 1.428, "frac_reward_zero_std": 0.0, "grad_norm": 0.07144709676504135, "learning_rate": 1.6598678612429e-06, "loss": -0.0089, "num_tokens": 6208002.0, "reward": 12.510882377624512, "reward_std": 2.848557949066162, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4377520084381104, "rewards/kidney_reward/std": 0.7783918380737305, "rewards/length2tails_reward/mean": 0.7538530826568604, "rewards/length2tails_reward/std": 0.30126190185546875, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.594069480895996, "rewards/thermo_reward/std": 1.9884313344955444, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.84375, "completions/mean_terminated_length": 272.84375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09988569654524326, "epoch": 1.43, "frac_reward_zero_std": 0.0, "grad_norm": 0.10599660128355026, "learning_rate": 1.658903835292206e-06, "loss": -0.0001, "num_tokens": 6216765.0, "reward": 13.246397018432617, "reward_std": 1.3967713117599487, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7891009449958801, "rewards/length2tails_reward/std": 0.2741408050060272, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1618995666503906, "rewards/thermo_reward/std": 1.2669503688812256, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11686312593519688, "epoch": 1.432, "frac_reward_zero_std": 0.0, "grad_norm": 0.09374818205833435, "learning_rate": 1.6579387259397126e-06, "loss": -0.0041, "num_tokens": 6225513.0, "reward": 13.120552062988281, "reward_std": 1.6873857975006104, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4534711837768555, "rewards/kidney_reward/std": 0.8334063291549683, "rewards/length2tails_reward/mean": 0.7929362058639526, "rewards/length2tails_reward/std": 0.2758377194404602, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1266021728515625, "rewards/thermo_reward/std": 1.4102927446365356, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.0, "completions/mean_terminated_length": 270.0, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0957229109480977, "epoch": 1.434, "frac_reward_zero_std": 0.0, "grad_norm": 0.08483195304870605, "learning_rate": 1.6569725347722993e-06, "loss": -0.0063, "num_tokens": 6234185.0, "reward": 12.214631080627441, "reward_std": 2.9265432357788086, "rewards/fitness_reward/mean": 7.188657760620117, "rewards/fitness_reward/std": 0.7179933190345764, "rewards/kidney_reward/mean": 2.4142332077026367, "rewards/kidney_reward/std": 0.766402006149292, "rewards/length2tails_reward/mean": 0.5923624038696289, "rewards/length2tails_reward/std": 0.33820590376853943, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.452504873275757, "rewards/thermo_reward/std": 2.009061098098755, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09740118542686105, "epoch": 1.436, "frac_reward_zero_std": 0.0, "grad_norm": 0.10787434875965118, "learning_rate": 1.656005263378625e-06, "loss": -0.0034, "num_tokens": 6242916.0, "reward": 12.084197998046875, "reward_std": 5.40502405166626, "rewards/fitness_reward/mean": 6.607909202575684, "rewards/fitness_reward/std": 2.749068021774292, "rewards/kidney_reward/mean": 2.319880247116089, "rewards/kidney_reward/std": 1.0164239406585693, "rewards/length2tails_reward/mean": 0.7005301117897034, "rewards/length2tails_reward/std": 0.35030439496040344, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.986356258392334, "rewards/thermo_reward/std": 1.8370975255966187, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.0, "completions/mean_terminated_length": 270.0, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09824726451188326, "epoch": 1.438, "frac_reward_zero_std": 0.0, "grad_norm": 0.16927476227283478, "learning_rate": 1.6550369133491247e-06, "loss": -0.0026, "num_tokens": 6251588.0, "reward": 12.84244155883789, "reward_std": 2.6119320392608643, "rewards/fitness_reward/mean": 7.014434337615967, "rewards/fitness_reward/std": 1.961518406867981, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.6008568406105042, "rewards/length2tails_reward/std": 0.34025225043296814, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.096160411834717, "rewards/thermo_reward/std": 1.3524459600448608, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.5625, "completions/mean_terminated_length": 271.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10022869799286127, "epoch": 1.44, "frac_reward_zero_std": 0.0, "grad_norm": 0.09501426666975021, "learning_rate": 1.654067486276006e-06, "loss": -0.0013, "num_tokens": 6260310.0, "reward": 13.449679374694824, "reward_std": 1.3080042600631714, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.507431745529175, "rewards/kidney_reward/std": 0.5364625453948975, "rewards/length2tails_reward/mean": 0.7026489973068237, "rewards/length2tails_reward/std": 0.34021517634391785, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.410797595977783, "rewards/thermo_reward/std": 1.0919665098190308, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09479405451565981, "epoch": 1.442, "frac_reward_zero_std": 0.0, "grad_norm": 0.3154640793800354, "learning_rate": 1.6530969837532485e-06, "loss": -0.0045, "num_tokens": 6269030.0, "reward": 13.004693984985352, "reward_std": 2.8685030937194824, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.390768527984619, "rewards/kidney_reward/std": 1.0364271402359009, "rewards/length2tails_reward/mean": 0.6494479775428772, "rewards/length2tails_reward/std": 0.34908527135849, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1453044414520264, "rewards/thermo_reward/std": 1.6958801746368408, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.53125, "completions/mean_terminated_length": 272.53125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.11144114658236504, "epoch": 1.444, "frac_reward_zero_std": 0.0, "grad_norm": 0.14011171460151672, "learning_rate": 1.6521254073766e-06, "loss": 0.0054, "num_tokens": 6277783.0, "reward": 12.596236228942871, "reward_std": 2.8663110733032227, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3510470390319824, "rewards/kidney_reward/std": 0.8927178382873535, "rewards/length2tails_reward/mean": 0.7744334936141968, "rewards/length2tails_reward/std": 0.20624327659606934, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.764070510864258, "rewards/thermo_reward/std": 1.86115562915802, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10096946451812983, "epoch": 1.446, "frac_reward_zero_std": 0.0, "grad_norm": 0.087567999958992, "learning_rate": 1.6511527587435735e-06, "loss": -0.0009, "num_tokens": 6286533.0, "reward": 12.627382278442383, "reward_std": 3.105064630508423, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3351755142211914, "rewards/kidney_reward/std": 0.95881587266922, "rewards/length2tails_reward/mean": 0.7469046115875244, "rewards/length2tails_reward/std": 0.3319987952709198, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.813839912414551, "rewards/thermo_reward/std": 2.0692062377929688, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.15625, "completions/mean_terminated_length": 272.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0993009340018034, "epoch": 1.448, "frac_reward_zero_std": 0.0, "grad_norm": 0.1495305299758911, "learning_rate": 1.650179039453445e-06, "loss": -0.0021, "num_tokens": 6295274.0, "reward": 11.36323070526123, "reward_std": 6.454504013061523, "rewards/fitness_reward/mean": 6.333192825317383, "rewards/fitness_reward/std": 3.254973888397217, "rewards/kidney_reward/mean": 2.0430855751037598, "rewards/kidney_reward/std": 1.639970302581787, "rewards/length2tails_reward/mean": 0.7403050661087036, "rewards/length2tails_reward/std": 0.3240678608417511, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8129220008850098, "rewards/thermo_reward/std": 2.1175599098205566, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.0625, "completions/mean_terminated_length": 271.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09985631797462702, "epoch": 1.45, "frac_reward_zero_std": 0.0, "grad_norm": 0.11432749032974243, "learning_rate": 1.6492042511072518e-06, "loss": 0.0021, "num_tokens": 6303980.0, "reward": 13.438583374023438, "reward_std": 1.562416672706604, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.53704833984375, "rewards/kidney_reward/std": 0.5059004426002502, "rewards/length2tails_reward/mean": 0.7021323442459106, "rewards/length2tails_reward/std": 0.2842758297920227, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3701372146606445, "rewards/thermo_reward/std": 1.1033118963241577, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10494589246809483, "epoch": 1.452, "frac_reward_zero_std": 0.0, "grad_norm": 0.09333200752735138, "learning_rate": 1.6482283953077884e-06, "loss": -0.0066, "num_tokens": 6312720.0, "reward": 13.020339965820312, "reward_std": 1.564118504524231, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7386187314987183, "rewards/length2tails_reward/std": 0.2925185263156891, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9408910274505615, "rewards/thermo_reward/std": 1.4223616123199463, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.100801981985569, "epoch": 1.454, "frac_reward_zero_std": 0.0, "grad_norm": 0.10188101977109909, "learning_rate": 1.647251473659604e-06, "loss": -0.0005, "num_tokens": 6321467.0, "reward": 12.78017807006836, "reward_std": 2.7691338062286377, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.399458408355713, "rewards/kidney_reward/std": 0.7088085412979126, "rewards/length2tails_reward/mean": 0.7669350504875183, "rewards/length2tails_reward/std": 0.27515965700149536, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9003515243530273, "rewards/thermo_reward/std": 1.9575797319412231, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0940690515562892, "epoch": 1.456, "frac_reward_zero_std": 0.0, "grad_norm": 0.13961991667747498, "learning_rate": 1.6462734877690008e-06, "loss": -0.0085, "num_tokens": 6330188.0, "reward": 12.996550559997559, "reward_std": 1.7360068559646606, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.692048192024231, "rewards/length2tails_reward/std": 0.31852561235427856, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.894399404525757, "rewards/thermo_reward/std": 1.5485317707061768, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10566907282918692, "epoch": 1.458, "frac_reward_zero_std": 0.0, "grad_norm": 0.15134963393211365, "learning_rate": 1.645294439244031e-06, "loss": -0.0018, "num_tokens": 6338920.0, "reward": 11.719990730285645, "reward_std": 4.794181823730469, "rewards/fitness_reward/mean": 6.929496765136719, "rewards/fitness_reward/std": 2.1311330795288086, "rewards/kidney_reward/mean": 2.1761727333068848, "rewards/kidney_reward/std": 1.1675001382827759, "rewards/length2tails_reward/mean": 0.722203254699707, "rewards/length2tails_reward/std": 0.30619093775749207, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.442101001739502, "rewards/thermo_reward/std": 2.2006373405456543, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.34375, "completions/mean_terminated_length": 271.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09765789099037647, "epoch": 1.46, "frac_reward_zero_std": 0.0, "grad_norm": 0.054920729249715805, "learning_rate": 1.6443143296944945e-06, "loss": -0.0017, "num_tokens": 6347635.0, "reward": 12.432905197143555, "reward_std": 5.245761871337891, "rewards/fitness_reward/mean": 6.691164016723633, "rewards/fitness_reward/std": 2.6455376148223877, "rewards/kidney_reward/mean": 2.3567051887512207, "rewards/kidney_reward/std": 1.1295703649520874, "rewards/length2tails_reward/mean": 0.7048357725143433, "rewards/length2tails_reward/std": 0.3146979808807373, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2145519256591797, "rewards/thermo_reward/std": 1.5634750127792358, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09744808170944452, "epoch": 1.462, "frac_reward_zero_std": 0.0, "grad_norm": 0.07000837475061417, "learning_rate": 1.643333160731934e-06, "loss": -0.0001, "num_tokens": 6356395.0, "reward": 12.698062896728516, "reward_std": 4.5932297706604, "rewards/fitness_reward/mean": 6.997720718383789, "rewards/fitness_reward/std": 2.056065320968628, "rewards/kidney_reward/mean": 2.4325098991394043, "rewards/kidney_reward/std": 1.0972591638565063, "rewards/length2tails_reward/mean": 0.7664821743965149, "rewards/length2tails_reward/std": 0.30541515350341797, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.091184139251709, "rewards/thermo_reward/std": 1.6904629468917847, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 270.28125, "completions/mean_terminated_length": 270.28125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.09817949496209621, "epoch": 1.464, "frac_reward_zero_std": 0.0, "grad_norm": 0.14221732318401337, "learning_rate": 1.6423509339696362e-06, "loss": -0.0015, "num_tokens": 6365076.0, "reward": 12.266221046447754, "reward_std": 5.32376766204834, "rewards/fitness_reward/mean": 6.971794605255127, "rewards/fitness_reward/std": 2.2027249336242676, "rewards/kidney_reward/mean": 2.2509796619415283, "rewards/kidney_reward/std": 1.3944387435913086, "rewards/length2tails_reward/mean": 0.721784234046936, "rewards/length2tails_reward/std": 0.307940274477005, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8712682723999023, "rewards/thermo_reward/std": 2.0534238815307617, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 272.8125, "completions/mean_terminated_length": 272.8125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1109353955835104, "epoch": 1.466, "frac_reward_zero_std": 0.0, "grad_norm": 0.08689557015895844, "learning_rate": 1.6413676510226259e-06, "loss": -0.005, "num_tokens": 6373838.0, "reward": 12.710731506347656, "reward_std": 3.480628252029419, "rewards/fitness_reward/mean": 6.987434387207031, "rewards/fitness_reward/std": 2.1142518520355225, "rewards/kidney_reward/mean": 2.431445837020874, "rewards/kidney_reward/std": 0.6755548119544983, "rewards/length2tails_reward/mean": 0.7463986873626709, "rewards/length2tails_reward/std": 0.3106290400028229, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.11721134185791, "rewards/thermo_reward/std": 1.5157802104949951, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10329800006002188, "epoch": 1.468, "frac_reward_zero_std": 0.0, "grad_norm": 0.09860417246818542, "learning_rate": 1.640383313507665e-06, "loss": -0.0047, "num_tokens": 6382569.0, "reward": 13.229984283447266, "reward_std": 1.7875175476074219, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7161481976509094, "rewards/length2tails_reward/std": 0.3124900758266449, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0707039833068848, "rewards/thermo_reward/std": 1.7761015892028809, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.4375, "completions/mean_terminated_length": 273.4375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.10924804303795099, "epoch": 1.47, "frac_reward_zero_std": 0.0, "grad_norm": 0.12164352834224701, "learning_rate": 1.6393979230432494e-06, "loss": 0.003, "num_tokens": 6391351.0, "reward": 12.864602088928223, "reward_std": 3.003507375717163, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.3887598514556885, "rewards/kidney_reward/std": 0.9033457040786743, "rewards/length2tails_reward/mean": 0.8108813762664795, "rewards/length2tails_reward/std": 0.2239287793636322, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0485873222351074, "rewards/thermo_reward/std": 1.633050799369812, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.21875, "completions/mean_terminated_length": 271.21875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 0.10441209375858307, "epoch": 1.472, "frac_reward_zero_std": 0.0, "grad_norm": 0.20049253106117249, "learning_rate": 1.6384114812496055e-06, "loss": -0.017, "num_tokens": 6400062.0, "reward": 12.799659729003906, "reward_std": 2.6198184490203857, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.376730442047119, "rewards/kidney_reward/std": 0.783859133720398, "rewards/length2tails_reward/mean": 0.7949988842010498, "rewards/length2tails_reward/std": 0.24802890419960022, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.939753770828247, "rewards/thermo_reward/std": 1.6938996315002441, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09739785362035036, "epoch": 1.474, "frac_reward_zero_std": 0.0, "grad_norm": 0.08881720155477524, "learning_rate": 1.6374239897486897e-06, "loss": -0.0008, "num_tokens": 6408842.0, "reward": 12.459671974182129, "reward_std": 4.295812129974365, "rewards/fitness_reward/mean": 7.014317512512207, "rewards/fitness_reward/std": 1.9621782302856445, "rewards/kidney_reward/mean": 2.3737847805023193, "rewards/kidney_reward/std": 0.9848051071166992, "rewards/length2tails_reward/mean": 0.7881827354431152, "rewards/length2tails_reward/std": 0.2944299280643463, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.892751455307007, "rewards/thermo_reward/std": 1.706197738647461, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10733734723180532, "epoch": 1.476, "frac_reward_zero_std": 0.0, "grad_norm": 0.14616692066192627, "learning_rate": 1.6364354501641833e-06, "loss": 0.0013, "num_tokens": 6417602.0, "reward": 13.177677154541016, "reward_std": 1.8004887104034424, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4556808471679688, "rewards/kidney_reward/std": 0.5514306426048279, "rewards/length2tails_reward/mean": 0.7851049304008484, "rewards/length2tails_reward/std": 0.2981264889240265, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1823010444641113, "rewards/thermo_reward/std": 1.3276283740997314, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.53125, "completions/mean_terminated_length": 272.53125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.10223050322383642, "epoch": 1.478, "frac_reward_zero_std": 0.0, "grad_norm": 0.0845065712928772, "learning_rate": 1.635445864121491e-06, "loss": -0.0022, "num_tokens": 6426355.0, "reward": 13.446077346801758, "reward_std": 0.9884455800056458, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7850735187530518, "rewards/length2tails_reward/std": 0.2458508163690567, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2799057960510254, "rewards/thermo_reward/std": 0.9799438714981079, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.10738382302224636, "epoch": 1.48, "frac_reward_zero_std": 0.0, "grad_norm": 0.12464555352926254, "learning_rate": 1.6344552332477386e-06, "loss": 0.0049, "num_tokens": 6435086.0, "reward": 13.028051376342773, "reward_std": 2.2536768913269043, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4271481037139893, "rewards/kidney_reward/std": 0.8362053036689758, "rewards/length2tails_reward/mean": 0.7704687714576721, "rewards/length2tails_reward/std": 0.2345697581768036, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0626707077026367, "rewards/thermo_reward/std": 1.5018627643585205, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10015272535383701, "epoch": 1.482, "frac_reward_zero_std": 0.0, "grad_norm": 0.08661410212516785, "learning_rate": 1.63346355917177e-06, "loss": -0.0032, "num_tokens": 6443841.0, "reward": 12.128244400024414, "reward_std": 5.633739948272705, "rewards/fitness_reward/mean": 6.676786422729492, "rewards/fitness_reward/std": 2.7070510387420654, "rewards/kidney_reward/mean": 2.29425048828125, "rewards/kidney_reward/std": 1.291764497756958, "rewards/length2tails_reward/mean": 0.7829782962799072, "rewards/length2tails_reward/std": 0.29065847396850586, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.978910207748413, "rewards/thermo_reward/std": 2.015270948410034, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.3125, "completions/mean_terminated_length": 272.3125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09847946092486382, "epoch": 1.484, "frac_reward_zero_std": 0.0, "grad_norm": 0.09854874759912491, "learning_rate": 1.6324708435241434e-06, "loss": 0.0039, "num_tokens": 6452587.0, "reward": 13.673116683959961, "reward_std": 0.554030179977417, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7548766136169434, "rewards/length2tails_reward/std": 0.26428091526031494, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.50996470451355, "rewards/thermo_reward/std": 0.5615194439888, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 269.34375, "completions/mean_terminated_length": 269.34375, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.09711272455751896, "epoch": 1.486, "frac_reward_zero_std": 0.0, "grad_norm": 0.4529189169406891, "learning_rate": 1.6314770879371312e-06, "loss": -0.0275, "num_tokens": 6461238.0, "reward": 12.402647972106934, "reward_std": 4.0601325035095215, "rewards/fitness_reward/mean": 6.937617301940918, "rewards/fitness_reward/std": 1.7827101945877075, "rewards/kidney_reward/mean": 2.415391683578491, "rewards/kidney_reward/std": 0.9006344676017761, "rewards/length2tails_reward/mean": 0.692960262298584, "rewards/length2tails_reward/std": 0.31155526638031006, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.880342483520508, "rewards/thermo_reward/std": 1.7204630374908447, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.15625, "completions/mean_terminated_length": 271.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0934545025229454, "epoch": 1.488, "frac_reward_zero_std": 0.0, "grad_norm": 0.17958083748817444, "learning_rate": 1.6304822940447136e-06, "loss": -0.0003, "num_tokens": 6469947.0, "reward": 12.78943157196045, "reward_std": 2.1344528198242188, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4513936042785645, "rewards/kidney_reward/std": 0.7045504450798035, "rewards/length2tails_reward/mean": 0.6639407873153687, "rewards/length2tails_reward/std": 0.3690692186355591, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.810458183288574, "rewards/thermo_reward/std": 1.6286946535110474, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10307030286639929, "epoch": 1.49, "frac_reward_zero_std": 0.0, "grad_norm": 0.13122333586215973, "learning_rate": 1.6294864634825802e-06, "loss": -0.0041, "num_tokens": 6478698.0, "reward": 13.756085395812988, "reward_std": 0.5152093768119812, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7868427038192749, "rewards/length2tails_reward/std": 0.2657336890697479, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.84375, "completions/mean_terminated_length": 273.84375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10226912703365088, "epoch": 1.492, "frac_reward_zero_std": 0.0, "grad_norm": 0.12127039581537247, "learning_rate": 1.6284895978881234e-06, "loss": 0.0008, "num_tokens": 6487493.0, "reward": 13.357280731201172, "reward_std": 1.3006410598754883, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.517043113708496, "rewards/kidney_reward/std": 0.2941751182079315, "rewards/length2tails_reward/mean": 0.8377007842063904, "rewards/length2tails_reward/std": 0.2545430660247803, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2952828407287598, "rewards/thermo_reward/std": 1.0900053977966309, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10751635394990444, "epoch": 1.494, "frac_reward_zero_std": 0.0, "grad_norm": 0.20295307040214539, "learning_rate": 1.6274916989004388e-06, "loss": 0.0041, "num_tokens": 6496241.0, "reward": 12.564690589904785, "reward_std": 4.798762321472168, "rewards/fitness_reward/mean": 6.9990715980529785, "rewards/fitness_reward/std": 2.0484225749969482, "rewards/kidney_reward/mean": 2.416921615600586, "rewards/kidney_reward/std": 1.185438632965088, "rewards/length2tails_reward/mean": 0.7583932876586914, "rewards/length2tails_reward/std": 0.2566794753074646, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.972858190536499, "rewards/thermo_reward/std": 1.8865139484405518, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1033137459307909, "epoch": 1.496, "frac_reward_zero_std": 0.0, "grad_norm": 0.11981922388076782, "learning_rate": 1.6264927681603205e-06, "loss": -0.0007, "num_tokens": 6505005.0, "reward": 12.776046752929688, "reward_std": 2.2382259368896484, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.508439064025879, "rewards/kidney_reward/std": 0.5310096144676208, "rewards/length2tails_reward/mean": 0.774960458278656, "rewards/length2tails_reward/std": 0.2797970771789551, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.786435842514038, "rewards/thermo_reward/std": 1.6946086883544922, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.4375, "completions/mean_terminated_length": 270.4375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09651091136038303, "epoch": 1.498, "frac_reward_zero_std": 0.0, "grad_norm": 0.06410472095012665, "learning_rate": 1.6254928073102584e-06, "loss": -0.0056, "num_tokens": 6513691.0, "reward": 12.236513137817383, "reward_std": 4.273045063018799, "rewards/fitness_reward/mean": 6.682684898376465, "rewards/fitness_reward/std": 2.681821346282959, "rewards/kidney_reward/mean": 2.457012176513672, "rewards/kidney_reward/std": 0.5447913408279419, "rewards/length2tails_reward/mean": 0.622907280921936, "rewards/length2tails_reward/std": 0.3190979063510895, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9345245361328125, "rewards/thermo_reward/std": 1.5298142433166504, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09542472753673792, "epoch": 1.5, "frac_reward_zero_std": 0.0, "grad_norm": 0.0805036649107933, "learning_rate": 1.6244918179944378e-06, "loss": -0.0053, "num_tokens": 6522449.0, "reward": 12.991552352905273, "reward_std": 3.2086243629455566, "rewards/fitness_reward/mean": 7.051550388336182, "rewards/fitness_reward/std": 1.7515581846237183, "rewards/kidney_reward/mean": 2.533559799194336, "rewards/kidney_reward/std": 0.5256339907646179, "rewards/length2tails_reward/mean": 0.7330061197280884, "rewards/length2tails_reward/std": 0.33843839168548584, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2331414222717285, "rewards/thermo_reward/std": 1.2927289009094238, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09929024521261454, "epoch": 1.502, "frac_reward_zero_std": 0.0, "grad_norm": 0.0803636759519577, "learning_rate": 1.6234898018587336e-06, "loss": -0.0071, "num_tokens": 6531161.0, "reward": 12.638072967529297, "reward_std": 3.315128803253174, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.356464385986328, "rewards/kidney_reward/std": 0.9754756689071655, "rewards/length2tails_reward/mean": 0.6804892420768738, "rewards/length2tails_reward/std": 0.3451850116252899, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8098840713500977, "rewards/thermo_reward/std": 2.2203240394592285, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10675702150911093, "epoch": 1.504, "frac_reward_zero_std": 0.0, "grad_norm": 0.12016993016004562, "learning_rate": 1.6224867605507092e-06, "loss": -0.003, "num_tokens": 6539909.0, "reward": 13.501442909240723, "reward_std": 0.9707752466201782, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7311062812805176, "rewards/length2tails_reward/std": 0.2865833044052124, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3680262565612793, "rewards/thermo_reward/std": 0.9308540225028992, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09785383474081755, "epoch": 1.506, "frac_reward_zero_std": 0.0, "grad_norm": 0.13616763055324554, "learning_rate": 1.6214826957196151e-06, "loss": -0.0033, "num_tokens": 6548629.0, "reward": 13.107882499694824, "reward_std": 2.6472818851470947, "rewards/fitness_reward/mean": 6.991186618804932, "rewards/fitness_reward/std": 2.093026638031006, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7090970873832703, "rewards/length2tails_reward/std": 0.3214232325553894, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.401383876800537, "rewards/thermo_reward/std": 0.9524527192115784, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11788608599454165, "epoch": 1.508, "frac_reward_zero_std": 0.0, "grad_norm": 0.058707550168037415, "learning_rate": 1.6204776090163826e-06, "loss": -0.0053, "num_tokens": 6557387.0, "reward": 13.186639785766602, "reward_std": 3.0022521018981934, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.5390896797180176, "rewards/kidney_reward/std": 0.49435171484947205, "rewards/length2tails_reward/mean": 0.7803868055343628, "rewards/length2tails_reward/std": 0.2767482101917267, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4164583683013916, "rewards/thermo_reward/std": 0.8831133842468262, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.71875, "completions/mean_terminated_length": 272.71875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11147108674049377, "epoch": 1.51, "frac_reward_zero_std": 0.0, "grad_norm": 0.2016543447971344, "learning_rate": 1.6194715020936248e-06, "loss": 0.0, "num_tokens": 6566146.0, "reward": 12.690385818481445, "reward_std": 5.008844375610352, "rewards/fitness_reward/mean": 6.974052429199219, "rewards/fitness_reward/std": 2.1899514198303223, "rewards/kidney_reward/mean": 2.2993273735046387, "rewards/kidney_reward/std": 1.3796072006225586, "rewards/length2tails_reward/mean": 0.793893575668335, "rewards/length2tails_reward/std": 0.23800085484981537, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2376160621643066, "rewards/thermo_reward/std": 1.5841246843338013, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10156818572431803, "epoch": 1.512, "frac_reward_zero_std": 0.0, "grad_norm": 0.07396720349788666, "learning_rate": 1.6184643766056313e-06, "loss": -0.0046, "num_tokens": 6574911.0, "reward": 13.152985572814941, "reward_std": 1.8636387586593628, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7816914916038513, "rewards/length2tails_reward/std": 0.28421974182128906, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.041869640350342, "rewards/thermo_reward/std": 1.6973748207092285, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.1875, "completions/mean_terminated_length": 272.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09756794199347496, "epoch": 1.514, "frac_reward_zero_std": 0.0, "grad_norm": 0.23166704177856445, "learning_rate": 1.6174562342083676e-06, "loss": -0.0055, "num_tokens": 6583653.0, "reward": 12.610737800598145, "reward_std": 3.5867321491241455, "rewards/fitness_reward/mean": 7.0473952293396, "rewards/fitness_reward/std": 1.7750624418258667, "rewards/kidney_reward/mean": 2.4483442306518555, "rewards/kidney_reward/std": 0.588398277759552, "rewards/length2tails_reward/mean": 0.7579518556594849, "rewards/length2tails_reward/std": 0.2971855103969574, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9392032623291016, "rewards/thermo_reward/std": 1.8040894269943237, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 273.96875, "completions/mean_terminated_length": 273.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10081325005739927, "epoch": 1.516, "frac_reward_zero_std": 0.0, "grad_norm": 0.15814757347106934, "learning_rate": 1.6164470765594697e-06, "loss": 0.0018, "num_tokens": 6592452.0, "reward": 11.030250549316406, "reward_std": 5.795161724090576, "rewards/fitness_reward/mean": 6.506538391113281, "rewards/fitness_reward/std": 2.9215149879455566, "rewards/kidney_reward/mean": 1.9202601909637451, "rewards/kidney_reward/std": 1.4814492464065552, "rewards/length2tails_reward/mean": 0.7626752853393555, "rewards/length2tails_reward/std": 0.30968478322029114, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4271841049194336, "rewards/thermo_reward/std": 2.2704720497131348, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.34375, "completions/mean_terminated_length": 271.34375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09835837502032518, "epoch": 1.518, "frac_reward_zero_std": 0.0, "grad_norm": 0.1475781798362732, "learning_rate": 1.6154369053182446e-06, "loss": -0.0036, "num_tokens": 6601167.0, "reward": 12.41716194152832, "reward_std": 3.9362692832946777, "rewards/fitness_reward/mean": 7.052267074584961, "rewards/fitness_reward/std": 1.7475041151046753, "rewards/kidney_reward/mean": 2.367185115814209, "rewards/kidney_reward/std": 0.8782473802566528, "rewards/length2tails_reward/mean": 0.6665036082267761, "rewards/length2tails_reward/std": 0.3518211245536804, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8310587406158447, "rewards/thermo_reward/std": 1.8054473400115967, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 273.4375, "completions/mean_terminated_length": 273.4375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.10087462235242128, "epoch": 1.52, "frac_reward_zero_std": 0.0, "grad_norm": 0.15830565989017487, "learning_rate": 1.6144257221456648e-06, "loss": 0.0067, "num_tokens": 6609949.0, "reward": 12.919963836669922, "reward_std": 2.08066987991333, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4414336681365967, "rewards/kidney_reward/std": 0.6237428188323975, "rewards/length2tails_reward/mean": 0.8254392147064209, "rewards/length2tails_reward/std": 0.20629006624221802, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.992310047149658, "rewards/thermo_reward/std": 1.2839230298995972, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.10661183949559927, "epoch": 1.522, "frac_reward_zero_std": 0.0, "grad_norm": 0.07809071987867355, "learning_rate": 1.6134135287043666e-06, "loss": -0.0031, "num_tokens": 6618665.0, "reward": 13.041078567504883, "reward_std": 1.7745665311813354, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8089865446090698, "rewards/length2tails_reward/std": 0.2741064727306366, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.927234411239624, "rewards/thermo_reward/std": 1.5899276733398438, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09442920796573162, "epoch": 1.524, "frac_reward_zero_std": 0.0, "grad_norm": 0.060308799147605896, "learning_rate": 1.612400326658648e-06, "loss": -0.007, "num_tokens": 6627397.0, "reward": 12.39229965209961, "reward_std": 5.122243404388428, "rewards/fitness_reward/mean": 6.723897933959961, "rewards/fitness_reward/std": 2.509150981903076, "rewards/kidney_reward/mean": 2.3572373390197754, "rewards/kidney_reward/std": 1.1268643140792847, "rewards/length2tails_reward/mean": 0.7106964588165283, "rewards/length2tails_reward/std": 0.32191285490989685, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1400952339172363, "rewards/thermo_reward/std": 1.7281146049499512, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.15625, "completions/mean_terminated_length": 272.15625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09344307612627745, "epoch": 1.526, "frac_reward_zero_std": 0.0, "grad_norm": 0.10824421793222427, "learning_rate": 1.6113861176744657e-06, "loss": -0.0051, "num_tokens": 6636138.0, "reward": 12.915403366088867, "reward_std": 2.598353147506714, "rewards/fitness_reward/mean": 7.011829853057861, "rewards/fitness_reward/std": 1.9762507677078247, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7344396114349365, "rewards/length2tails_reward/std": 0.2898414731025696, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1583681106567383, "rewards/thermo_reward/std": 1.276017665863037, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.71875, "completions/mean_terminated_length": 271.71875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0914773028343916, "epoch": 1.528, "frac_reward_zero_std": 0.0, "grad_norm": 0.07586502283811569, "learning_rate": 1.6103709034194307e-06, "loss": -0.005, "num_tokens": 6644865.0, "reward": 13.459787368774414, "reward_std": 2.1172537803649902, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5065970420837402, "rewards/kidney_reward/std": 0.6781591773033142, "rewards/length2tails_reward/mean": 0.7208532094955444, "rewards/length2tails_reward/std": 0.2844887971878052, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.477428913116455, "rewards/thermo_reward/std": 1.1407392024993896, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 264.71875, "completions/mean_terminated_length": 264.71875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.10680115036666393, "epoch": 1.53, "frac_reward_zero_std": 0.0, "grad_norm": 0.43259674310684204, "learning_rate": 1.6093546855628081e-06, "loss": -0.0975, "num_tokens": 6653368.0, "reward": 12.1873779296875, "reward_std": 6.2827935218811035, "rewards/fitness_reward/mean": 6.652431488037109, "rewards/fitness_reward/std": 2.7991580963134766, "rewards/kidney_reward/mean": 2.2365856170654297, "rewards/kidney_reward/std": 1.5544345378875732, "rewards/length2tails_reward/mean": 0.696941077709198, "rewards/length2tails_reward/std": 0.3050379753112793, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.128666639328003, "rewards/thermo_reward/std": 1.9798851013183594, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.625, "completions/mean_terminated_length": 273.625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10760107543319464, "epoch": 1.532, "frac_reward_zero_std": 0.0, "grad_norm": 0.12129205465316772, "learning_rate": 1.608337465775513e-06, "loss": 0.0028, "num_tokens": 6662156.0, "reward": 13.722698211669922, "reward_std": 0.5326492190361023, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8518332242965698, "rewards/length2tails_reward/std": 0.18499179184436798, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498504638671875, "rewards/thermo_reward/std": 0.5360844731330872, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.8125, "completions/mean_terminated_length": 272.8125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10517558269202709, "epoch": 1.534, "frac_reward_zero_std": 0.0, "grad_norm": 0.11031178385019302, "learning_rate": 1.6073192457301078e-06, "loss": -0.0063, "num_tokens": 6670918.0, "reward": 13.046195983886719, "reward_std": 2.4702224731445312, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4882965087890625, "rewards/kidney_reward/std": 0.7816820740699768, "rewards/length2tails_reward/mean": 0.7864515781402588, "rewards/length2tails_reward/std": 0.28928881883621216, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0755791664123535, "rewards/thermo_reward/std": 1.6299465894699097, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.9375, "completions/mean_terminated_length": 271.9375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09968086518347263, "epoch": 1.536, "frac_reward_zero_std": 0.0, "grad_norm": 0.11324171721935272, "learning_rate": 1.606300027100799e-06, "loss": -0.0002, "num_tokens": 6679652.0, "reward": 12.413396835327148, "reward_std": 4.348886966705322, "rewards/fitness_reward/mean": 7.004317283630371, "rewards/fitness_reward/std": 2.018749713897705, "rewards/kidney_reward/mean": 2.41861629486084, "rewards/kidney_reward/std": 0.8829330205917358, "rewards/length2tails_reward/mean": 0.7344741821289062, "rewards/length2tails_reward/std": 0.29354095458984375, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8170166015625, "rewards/thermo_reward/std": 1.8177590370178223, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09980903100222349, "epoch": 1.538, "frac_reward_zero_std": 0.0, "grad_norm": 0.07742294669151306, "learning_rate": 1.6052798115634362e-06, "loss": -0.0049, "num_tokens": 6688392.0, "reward": 13.258471488952637, "reward_std": 1.6307097673416138, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7377700805664062, "rewards/length2tails_reward/std": 0.2975505292415619, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.151747941970825, "rewards/thermo_reward/std": 1.4797722101211548, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.1875, "completions/mean_terminated_length": 271.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09326224122196436, "epoch": 1.54, "frac_reward_zero_std": 0.0, "grad_norm": 0.06254822760820389, "learning_rate": 1.6042586007955073e-06, "loss": -0.0068, "num_tokens": 6697102.0, "reward": 12.24995231628418, "reward_std": 4.322062969207764, "rewards/fitness_reward/mean": 6.743242263793945, "rewards/fitness_reward/std": 2.4315876960754395, "rewards/kidney_reward/mean": 2.3956949710845947, "rewards/kidney_reward/std": 0.7116798758506775, "rewards/length2tails_reward/mean": 0.6738260984420776, "rewards/length2tails_reward/std": 0.3288188576698303, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9436330795288086, "rewards/thermo_reward/std": 1.6477420330047607, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.10175461042672396, "epoch": 1.542, "frac_reward_zero_std": 0.0, "grad_norm": 0.11549469828605652, "learning_rate": 1.6032363964761361e-06, "loss": -0.0004, "num_tokens": 6705833.0, "reward": 13.348580360412598, "reward_std": 1.3180879354476929, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7467554211616516, "rewards/length2tails_reward/std": 0.2537044584751129, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.268317222595215, "rewards/thermo_reward/std": 1.1638838052749634, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.10368968173861504, "epoch": 1.544, "frac_reward_zero_std": 0.0, "grad_norm": 0.0708426684141159, "learning_rate": 1.6022132002860821e-06, "loss": -0.0049, "num_tokens": 6714613.0, "reward": 13.347894668579102, "reward_std": 1.6074857711791992, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8183344602584839, "rewards/length2tails_reward/std": 0.23631368577480316, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2331151962280273, "rewards/thermo_reward/std": 1.455021858215332, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.40625, "completions/mean_terminated_length": 271.40625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09773553442209959, "epoch": 1.546, "frac_reward_zero_std": 0.0, "grad_norm": 0.18035611510276794, "learning_rate": 1.6011890139077333e-06, "loss": -0.0003, "num_tokens": 6723330.0, "reward": 12.25750732421875, "reward_std": 5.029834270477295, "rewards/fitness_reward/mean": 6.723915100097656, "rewards/fitness_reward/std": 2.5090818405151367, "rewards/kidney_reward/mean": 2.3070859909057617, "rewards/kidney_reward/std": 1.111136794090271, "rewards/length2tails_reward/mean": 0.6995996832847595, "rewards/length2tails_reward/std": 0.32060977816581726, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0565457344055176, "rewards/thermo_reward/std": 1.7153652906417847, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10099483374506235, "epoch": 1.548, "frac_reward_zero_std": 0.0, "grad_norm": 0.3348585367202759, "learning_rate": 1.6001638390251073e-06, "loss": 0.0036, "num_tokens": 6732084.0, "reward": 12.60731315612793, "reward_std": 4.337095260620117, "rewards/fitness_reward/mean": 7.034456729888916, "rewards/fitness_reward/std": 1.8482544422149658, "rewards/kidney_reward/mean": 2.3530704975128174, "rewards/kidney_reward/std": 1.0146421194076538, "rewards/length2tails_reward/mean": 0.7690781950950623, "rewards/length2tails_reward/std": 0.27084881067276, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.042877674102783, "rewards/thermo_reward/std": 1.8710243701934814, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.10179403517395258, "epoch": 1.55, "frac_reward_zero_std": 0.0, "grad_norm": 0.1293950229883194, "learning_rate": 1.5991376773238466e-06, "loss": 0.0014, "num_tokens": 6740821.0, "reward": 12.269551277160645, "reward_std": 5.643449306488037, "rewards/fitness_reward/mean": 6.707425594329834, "rewards/fitness_reward/std": 2.57702374458313, "rewards/kidney_reward/mean": 2.304417848587036, "rewards/kidney_reward/std": 1.4015402793884277, "rewards/length2tails_reward/mean": 0.8244496583938599, "rewards/length2tails_reward/std": 0.2605707049369812, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.075263023376465, "rewards/thermo_reward/std": 1.8867286443710327, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.46875, "completions/mean_terminated_length": 271.46875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09344108309596777, "epoch": 1.552, "frac_reward_zero_std": 0.0, "grad_norm": 0.1531888246536255, "learning_rate": 1.5981105304912159e-06, "loss": -0.0081, "num_tokens": 6749540.0, "reward": 12.162508964538574, "reward_std": 3.732130527496338, "rewards/fitness_reward/mean": 6.8792243003845215, "rewards/fitness_reward/std": 1.8029879331588745, "rewards/kidney_reward/mean": 2.2372934818267822, "rewards/kidney_reward/std": 1.1890099048614502, "rewards/length2tails_reward/mean": 0.6980095505714417, "rewards/length2tails_reward/std": 0.33210060000419617, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.876189947128296, "rewards/thermo_reward/std": 1.869378685951233, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.96875, "completions/mean_terminated_length": 272.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0962613089941442, "epoch": 1.554, "frac_reward_zero_std": 0.0, "grad_norm": 0.07587003707885742, "learning_rate": 1.5970824002161006e-06, "loss": -0.0084, "num_tokens": 6758307.0, "reward": 13.13638687133789, "reward_std": 2.2762598991394043, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4853367805480957, "rewards/kidney_reward/std": 0.6571304202079773, "rewards/length2tails_reward/mean": 0.7716554999351501, "rewards/length2tails_reward/std": 0.2985219657421112, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1702094078063965, "rewards/thermo_reward/std": 1.5183756351470947, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10898962616920471, "epoch": 1.556, "frac_reward_zero_std": 0.0, "grad_norm": 0.127036452293396, "learning_rate": 1.5960532881890023e-06, "loss": 0.0081, "num_tokens": 6767054.0, "reward": 13.312883377075195, "reward_std": 0.7643176913261414, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7653322815895081, "rewards/length2tails_reward/std": 0.28019002079963684, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2307629585266113, "rewards/thermo_reward/std": 0.6471191644668579, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10188727173954248, "epoch": 1.558, "frac_reward_zero_std": 0.0, "grad_norm": 0.1719304472208023, "learning_rate": 1.595023196102037e-06, "loss": 0.009, "num_tokens": 6775791.0, "reward": 13.142478942871094, "reward_std": 2.2195565700531006, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4213266372680664, "rewards/kidney_reward/std": 0.7287588119506836, "rewards/length2tails_reward/mean": 0.7504162788391113, "rewards/length2tails_reward/std": 0.2720203995704651, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1849265098571777, "rewards/thermo_reward/std": 1.569002389907837, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.09375, "completions/mean_terminated_length": 271.09375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08891648892313242, "epoch": 1.56, "frac_reward_zero_std": 0.0, "grad_norm": 1.4685585498809814, "learning_rate": 1.5939921256489327e-06, "loss": -0.0054, "num_tokens": 6784498.0, "reward": 13.171124458312988, "reward_std": 2.093531370162964, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5107901096343994, "rewards/kidney_reward/std": 0.5183009505271912, "rewards/length2tails_reward/mean": 0.7029236555099487, "rewards/length2tails_reward/std": 0.30575695633888245, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.186366081237793, "rewards/thermo_reward/std": 1.5517044067382812, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.10196580365300179, "epoch": 1.562, "frac_reward_zero_std": 0.0, "grad_norm": 0.08939795941114426, "learning_rate": 1.5929600785250256e-06, "loss": -0.0054, "num_tokens": 6793202.0, "reward": 13.353775978088379, "reward_std": 1.6284438371658325, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7209261655807495, "rewards/length2tails_reward/std": 0.29964157938957214, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2213778495788574, "rewards/thermo_reward/std": 1.497393250465393, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.90625, "completions/mean_terminated_length": 271.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.11122054141014814, "epoch": 1.564, "frac_reward_zero_std": 0.0, "grad_norm": 0.10926977545022964, "learning_rate": 1.591927056427258e-06, "loss": -0.0009, "num_tokens": 6801935.0, "reward": 12.359277725219727, "reward_std": 3.305664300918579, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3020734786987305, "rewards/kidney_reward/std": 0.9948132038116455, "rewards/length2tails_reward/mean": 0.7510539293289185, "rewards/length2tails_reward/std": 0.2921181321144104, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.578423500061035, "rewards/thermo_reward/std": 2.3195579051971436, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10621493496000767, "epoch": 1.5659999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.07537518441677094, "learning_rate": 1.5908930610541745e-06, "loss": -0.012, "num_tokens": 6810675.0, "reward": 11.830780029296875, "reward_std": 4.0461931228637695, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.144914388656616, "rewards/kidney_reward/std": 1.109388828277588, "rewards/length2tails_reward/mean": 0.7219532132148743, "rewards/length2tails_reward/std": 0.37782686948776245, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.4606173038482666, "rewards/thermo_reward/std": 2.188671112060547, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 273.25, "completions/mean_terminated_length": 273.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11602041311562061, "epoch": 1.568, "frac_reward_zero_std": 0.0, "grad_norm": 0.24273134768009186, "learning_rate": 1.5898580941059217e-06, "loss": 0.0094, "num_tokens": 6819451.0, "reward": 13.579347610473633, "reward_std": 1.249585747718811, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7413228750228882, "rewards/length2tails_reward/std": 0.3023105263710022, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4449102878570557, "rewards/thermo_reward/std": 1.1124043464660645, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.9375, "completions/mean_terminated_length": 273.9375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10096332058310509, "epoch": 1.5699999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.08716148883104324, "learning_rate": 1.588822157284242e-06, "loss": -0.005, "num_tokens": 6828249.0, "reward": 13.189170837402344, "reward_std": 1.5594357252120972, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8377348184585571, "rewards/length2tails_reward/std": 0.26007550954818726, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.072450637817383, "rewards/thermo_reward/std": 1.5148123502731323, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10304850526154041, "epoch": 1.572, "frac_reward_zero_std": 0.0, "grad_norm": 0.16281567513942719, "learning_rate": 1.587785252292473e-06, "loss": 0.0046, "num_tokens": 6836993.0, "reward": 13.08975601196289, "reward_std": 2.4612064361572266, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4109854698181152, "rewards/kidney_reward/std": 0.848101019859314, "rewards/length2tails_reward/mean": 0.7908504605293274, "rewards/length2tails_reward/std": 0.23914951086044312, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1960091590881348, "rewards/thermo_reward/std": 1.4220669269561768, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10091214813292027, "epoch": 1.5739999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.1641678363084793, "learning_rate": 1.5867473808355452e-06, "loss": 0.001, "num_tokens": 6845737.0, "reward": 13.60064697265625, "reward_std": 1.1486788988113403, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7576706409454346, "rewards/length2tails_reward/std": 0.2726164758205414, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4645745754241943, "rewards/thermo_reward/std": 1.0126454830169678, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10151567868888378, "epoch": 1.576, "frac_reward_zero_std": 0.0, "grad_norm": 0.16830457746982574, "learning_rate": 1.5857085446199769e-06, "loss": 0.003, "num_tokens": 6854488.0, "reward": 13.096981048583984, "reward_std": 1.9139457941055298, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5303173065185547, "rewards/kidney_reward/std": 0.5439756512641907, "rewards/length2tails_reward/mean": 0.7250336408615112, "rewards/length2tails_reward/std": 0.33325880765914917, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0904839038848877, "rewards/thermo_reward/std": 1.2277387380599976, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 274.21875, "completions/mean_terminated_length": 274.21875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.10681565944105387, "epoch": 1.5779999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.0950051099061966, "learning_rate": 1.5846687453538735e-06, "loss": -0.0039, "num_tokens": 6863295.0, "reward": 12.962221145629883, "reward_std": 2.356898784637451, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4387731552124023, "rewards/kidney_reward/std": 0.6374660134315491, "rewards/length2tails_reward/mean": 0.8561804294586182, "rewards/length2tails_reward/std": 0.1828988492488861, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9766440391540527, "rewards/thermo_reward/std": 1.8269686698913574, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 271.59375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09641438629478216, "epoch": 1.58, "frac_reward_zero_std": 0.0, "grad_norm": 0.2553471326828003, "learning_rate": 1.5836279847469234e-06, "loss": 0.0014, "num_tokens": 6872018.0, "reward": 12.10179615020752, "reward_std": 5.340146541595459, "rewards/fitness_reward/mean": 6.624960422515869, "rewards/fitness_reward/std": 2.897318124771118, "rewards/kidney_reward/mean": 2.352849006652832, "rewards/kidney_reward/std": 1.0995126962661743, "rewards/length2tails_reward/mean": 0.7194082736968994, "rewards/length2tails_reward/std": 0.30610015988349915, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9520459175109863, "rewards/thermo_reward/std": 1.7063323259353638, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09921843744814396, "epoch": 1.5819999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.09495562314987183, "learning_rate": 1.582586264510396e-06, "loss": -0.0037, "num_tokens": 6880782.0, "reward": 12.491376876831055, "reward_std": 3.032684326171875, "rewards/fitness_reward/mean": 7.188657283782959, "rewards/fitness_reward/std": 0.7179933190345764, "rewards/kidney_reward/mean": 2.3157799243927, "rewards/kidney_reward/std": 0.9158964157104492, "rewards/length2tails_reward/mean": 0.7796016335487366, "rewards/length2tails_reward/std": 0.290948748588562, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8089795112609863, "rewards/thermo_reward/std": 1.6366848945617676, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10440836194902658, "epoch": 1.584, "frac_reward_zero_std": 0.0, "grad_norm": 0.11112482845783234, "learning_rate": 1.5815435863571387e-06, "loss": -0.0043, "num_tokens": 6889520.0, "reward": 12.341741561889648, "reward_std": 3.5781798362731934, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.374967098236084, "rewards/kidney_reward/std": 0.794853687286377, "rewards/length2tails_reward/mean": 0.7389723062515259, "rewards/length2tails_reward/std": 0.2798449695110321, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.739823341369629, "rewards/thermo_reward/std": 1.8928732872009277, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.1875, "completions/mean_terminated_length": 272.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1030658707022667, "epoch": 1.5859999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.10627459734678268, "learning_rate": 1.5804999520015733e-06, "loss": -0.0052, "num_tokens": 6898262.0, "reward": 13.374687194824219, "reward_std": 2.236538887023926, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4553585052490234, "rewards/kidney_reward/std": 0.8229176998138428, "rewards/length2tails_reward/mean": 0.72886723279953, "rewards/length2tails_reward/std": 0.32557907700538635, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.442765951156616, "rewards/thermo_reward/std": 1.1234129667282104, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.34375, "completions/mean_terminated_length": 271.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10048514977097511, "epoch": 1.588, "frac_reward_zero_std": 0.0, "grad_norm": 0.05569272115826607, "learning_rate": 1.579455363159695e-06, "loss": -0.0057, "num_tokens": 6906977.0, "reward": 13.202973365783691, "reward_std": 1.7612926959991455, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7030308842658997, "rewards/length2tails_reward/std": 0.3247583210468292, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.099724292755127, "rewards/thermo_reward/std": 1.5753053426742554, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10165113396942616, "epoch": 1.5899999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.14155659079551697, "learning_rate": 1.5784098215490666e-06, "loss": -0.0037, "num_tokens": 6915733.0, "reward": 13.36789321899414, "reward_std": 1.1217387914657593, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7168630361557007, "rewards/length2tails_reward/std": 0.3124147355556488, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2632603645324707, "rewards/thermo_reward/std": 1.056195616722107, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10210881475359201, "epoch": 1.592, "frac_reward_zero_std": 0.0, "grad_norm": 0.08840259909629822, "learning_rate": 1.5773633288888195e-06, "loss": 0.001, "num_tokens": 6924478.0, "reward": 13.514481544494629, "reward_std": 1.4057046175003052, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.536932945251465, "rewards/kidney_reward/std": 0.5065523982048035, "rewards/length2tails_reward/mean": 0.7745459079742432, "rewards/length2tails_reward/std": 0.2727997303009033, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4389090538024902, "rewards/thermo_reward/std": 0.9537463784217834, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09803248103708029, "epoch": 1.5939999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.09315460920333862, "learning_rate": 1.5763158868996475e-06, "loss": -0.0068, "num_tokens": 6933222.0, "reward": 12.848220825195312, "reward_std": 2.481563091278076, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.448021173477173, "rewards/kidney_reward/std": 0.590040922164917, "rewards/length2tails_reward/mean": 0.7305286526679993, "rewards/length2tails_reward/std": 0.3285638391971588, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.923471450805664, "rewards/thermo_reward/std": 1.7929234504699707, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09597568307071924, "epoch": 1.596, "frac_reward_zero_std": 0.0, "grad_norm": 0.06843114644289017, "learning_rate": 1.5752674973038059e-06, "loss": 0.0002, "num_tokens": 6941953.0, "reward": 13.095829010009766, "reward_std": 1.8167020082473755, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4896838665008545, "rewards/kidney_reward/std": 0.3229711949825287, "rewards/length2tails_reward/mean": 0.7349690198898315, "rewards/length2tails_reward/std": 0.28378915786743164, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.071463108062744, "rewards/thermo_reward/std": 1.5440276861190796, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.90625, "completions/mean_terminated_length": 270.90625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1026212815195322, "epoch": 1.5979999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.1733075976371765, "learning_rate": 1.574218161825108e-06, "loss": -0.0014, "num_tokens": 6950654.0, "reward": 11.799349784851074, "reward_std": 5.825907230377197, "rewards/fitness_reward/mean": 6.423315048217773, "rewards/fitness_reward/std": 2.9632887840270996, "rewards/kidney_reward/mean": 2.2777504920959473, "rewards/kidney_reward/std": 1.1740972995758057, "rewards/length2tails_reward/mean": 0.6755983829498291, "rewards/length2tails_reward/std": 0.2964775860309601, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 2.93697452545166, "rewards/thermo_reward/std": 1.9402716159820557, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.6875, "completions/mean_terminated_length": 271.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10204800125211477, "epoch": 1.6, "frac_reward_zero_std": 0.0, "grad_norm": 0.14254425466060638, "learning_rate": 1.5731678821889222e-06, "loss": 0.0027, "num_tokens": 6959380.0, "reward": 13.141883850097656, "reward_std": 1.9513304233551025, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.411172389984131, "rewards/kidney_reward/std": 0.649071216583252, "rewards/length2tails_reward/mean": 0.6890884637832642, "rewards/length2tails_reward/std": 0.3426918089389801, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.258126735687256, "rewards/thermo_reward/std": 1.080284595489502, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09764991933479905, "epoch": 1.6019999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.1123654767870903, "learning_rate": 1.5721166601221695e-06, "loss": 0.0052, "num_tokens": 6968108.0, "reward": 13.41652774810791, "reward_std": 2.3344967365264893, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5197842121124268, "rewards/kidney_reward/std": 0.6035600304603577, "rewards/length2tails_reward/mean": 0.7101686000823975, "rewards/length2tails_reward/std": 0.32164332270622253, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4220504760742188, "rewards/thermo_reward/std": 1.4359506368637085, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09724361402913928, "epoch": 1.604, "frac_reward_zero_std": 0.0, "grad_norm": 0.05826536566019058, "learning_rate": 1.5710644973533207e-06, "loss": -0.002, "num_tokens": 6976844.0, "reward": 13.142796516418457, "reward_std": 2.4546236991882324, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4166100025177, "rewards/kidney_reward/std": 0.8939437866210938, "rewards/length2tails_reward/mean": 0.7290231585502625, "rewards/length2tails_reward/std": 0.33181530237197876, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.192098617553711, "rewards/thermo_reward/std": 1.6234620809555054, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09845475014299154, "epoch": 1.6059999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.11397599428892136, "learning_rate": 1.570011395612393e-06, "loss": -0.0044, "num_tokens": 6985565.0, "reward": 13.158821105957031, "reward_std": 2.5375492572784424, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.423973560333252, "rewards/kidney_reward/std": 0.7991431355476379, "rewards/length2tails_reward/mean": 0.7127482891082764, "rewards/length2tails_reward/std": 0.31570836901664734, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2598977088928223, "rewards/thermo_reward/std": 1.5675069093704224, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09938731510192156, "epoch": 1.608, "frac_reward_zero_std": 0.0, "grad_norm": 0.13295812904834747, "learning_rate": 1.568957356630947e-06, "loss": -0.0022, "num_tokens": 6994319.0, "reward": 13.225662231445312, "reward_std": 2.6128029823303223, "rewards/fitness_reward/mean": 6.987685203552246, "rewards/fitness_reward/std": 2.1128344535827637, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7760343551635742, "rewards/length2tails_reward/std": 0.284006267786026, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.488612651824951, "rewards/thermo_reward/std": 0.8943835496902466, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.6875, "completions/mean_terminated_length": 271.6875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.11059173010289669, "epoch": 1.6099999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.10347039252519608, "learning_rate": 1.567902382142086e-06, "loss": -0.0026, "num_tokens": 7003045.0, "reward": 12.732093811035156, "reward_std": 2.605804204940796, "rewards/fitness_reward/mean": 7.188657283782959, "rewards/fitness_reward/std": 0.7179933190345764, "rewards/kidney_reward/mean": 2.3838021755218506, "rewards/kidney_reward/std": 0.7902752757072449, "rewards/length2tails_reward/mean": 0.7132542729377747, "rewards/length2tails_reward/std": 0.2912064790725708, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.988309621810913, "rewards/thermo_reward/std": 1.5316686630249023, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "entropy": 0.09359847661107779, "epoch": 1.612, "frac_reward_zero_std": 0.0, "grad_norm": 0.0684969425201416, "learning_rate": 1.56684647388045e-06, "loss": -0.0072, "num_tokens": 7011774.0, "reward": 13.25399398803711, "reward_std": 1.4808253049850464, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4519968032836914, "rewards/kidney_reward/std": 0.8416019678115845, "rewards/length2tails_reward/mean": 0.7692837715148926, "rewards/length2tails_reward/std": 0.29183003306388855, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2638840675354004, "rewards/thermo_reward/std": 1.1789251565933228, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09691243059933186, "epoch": 1.6139999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.06483133137226105, "learning_rate": 1.5657896335822145e-06, "loss": -0.0044, "num_tokens": 7020518.0, "reward": 13.43472671508789, "reward_std": 1.8454060554504395, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5281412601470947, "rewards/kidney_reward/std": 0.556286096572876, "rewards/length2tails_reward/mean": 0.7682965993881226, "rewards/length2tails_reward/std": 0.25869956612586975, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3685708045959473, "rewards/thermo_reward/std": 1.308498501777649, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09843306802213192, "epoch": 1.616, "frac_reward_zero_std": 0.0, "grad_norm": 0.09614210575819016, "learning_rate": 1.5647318629850883e-06, "loss": -0.0024, "num_tokens": 7029262.0, "reward": 12.651477813720703, "reward_std": 3.146268606185913, "rewards/fitness_reward/mean": 6.980320930480957, "rewards/fitness_reward/std": 2.154494524002075, "rewards/kidney_reward/mean": 2.462048053741455, "rewards/kidney_reward/std": 0.6474180817604065, "rewards/length2tails_reward/mean": 0.6856108903884888, "rewards/length2tails_reward/std": 0.3616497218608856, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0405473709106445, "rewards/thermo_reward/std": 1.6127054691314697, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09584531467407942, "epoch": 1.6179999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.08554235845804214, "learning_rate": 1.563673163828309e-06, "loss": -0.0034, "num_tokens": 7038026.0, "reward": 13.498817443847656, "reward_std": 1.6057590246200562, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.491739273071289, "rewards/kidney_reward/std": 0.621989369392395, "rewards/length2tails_reward/mean": 0.8107680082321167, "rewards/length2tails_reward/std": 0.22752250730991364, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.464815378189087, "rewards/thermo_reward/std": 1.0114378929138184, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09629400447010994, "epoch": 1.62, "frac_reward_zero_std": 0.0, "grad_norm": 0.1261540800333023, "learning_rate": 1.5626135378526417e-06, "loss": 0.0003, "num_tokens": 7046766.0, "reward": 12.858604431152344, "reward_std": 4.045932769775391, "rewards/fitness_reward/mean": 7.010948657989502, "rewards/fitness_reward/std": 1.9812366962432861, "rewards/kidney_reward/mean": 2.477144718170166, "rewards/kidney_reward/std": 0.8447661399841309, "rewards/length2tails_reward/mean": 0.7198797464370728, "rewards/length2tails_reward/std": 0.3227717876434326, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.19852352142334, "rewards/thermo_reward/std": 1.4419629573822021, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.11237006168812513, "epoch": 1.6219999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 1.2420001029968262, "learning_rate": 1.5615529868003747e-06, "loss": -0.0043, "num_tokens": 7055522.0, "reward": 12.745379447937012, "reward_std": 2.7641563415527344, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.388613224029541, "rewards/kidney_reward/std": 0.904141366481781, "rewards/length2tails_reward/mean": 0.7696863412857056, "rewards/length2tails_reward/std": 0.28826603293418884, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.876121997833252, "rewards/thermo_reward/std": 1.8102315664291382, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 276.03125, "completions/mean_terminated_length": 276.03125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.10469907149672508, "epoch": 1.624, "frac_reward_zero_std": 0.0, "grad_norm": 0.1210826188325882, "learning_rate": 1.5604915124153179e-06, "loss": -0.0015, "num_tokens": 7064387.0, "reward": 13.120941162109375, "reward_std": 1.6486657857894897, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.517043113708496, "rewards/kidney_reward/std": 0.2941751182079315, "rewards/length2tails_reward/mean": 0.883176326751709, "rewards/length2tails_reward/std": 0.18250958621501923, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0543954372406006, "rewards/thermo_reward/std": 1.4460625648498535, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.5, "completions/mean_terminated_length": 273.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09974722750484943, "epoch": 1.626, "frac_reward_zero_std": 0.0, "grad_norm": 0.10284367948770523, "learning_rate": 1.5594291164427996e-06, "loss": -0.0039, "num_tokens": 7073171.0, "reward": 12.768733978271484, "reward_std": 3.78942608833313, "rewards/fitness_reward/mean": 7.052046775817871, "rewards/fitness_reward/std": 1.7487484216690063, "rewards/kidney_reward/mean": 2.329763889312744, "rewards/kidney_reward/std": 1.1252422332763672, "rewards/length2tails_reward/mean": 0.8165687918663025, "rewards/length2tails_reward/std": 0.2892093062400818, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.205265522003174, "rewards/thermo_reward/std": 1.6000819206237793, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10471698269248009, "epoch": 1.6280000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.13619332015514374, "learning_rate": 1.5583658006296623e-06, "loss": -0.0018, "num_tokens": 7081941.0, "reward": 12.858470916748047, "reward_std": 2.571259021759033, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4575161933898926, "rewards/kidney_reward/std": 0.8109323978424072, "rewards/length2tails_reward/mean": 0.7841614484786987, "rewards/length2tails_reward/std": 0.300397127866745, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.861354112625122, "rewards/thermo_reward/std": 1.9132797718048096, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.96875, "completions/mean_terminated_length": 272.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1012994833290577, "epoch": 1.63, "frac_reward_zero_std": 0.0, "grad_norm": 0.11487077176570892, "learning_rate": 1.5573015667242624e-06, "loss": 0.0013, "num_tokens": 7090708.0, "reward": 13.450803756713867, "reward_std": 1.1194764375686646, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7772794365882874, "rewards/length2tails_reward/std": 0.2860060930252075, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3401296138763428, "rewards/thermo_reward/std": 1.0613343715667725, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.78125, "completions/mean_terminated_length": 270.78125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09142725821584463, "epoch": 1.6320000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.11149288713932037, "learning_rate": 1.5562364164764647e-06, "loss": 0.0033, "num_tokens": 7099405.0, "reward": 12.100950241088867, "reward_std": 5.697356224060059, "rewards/fitness_reward/mean": 6.847840309143066, "rewards/fitness_reward/std": 2.3250811100006104, "rewards/kidney_reward/mean": 2.149076461791992, "rewards/kidney_reward/std": 1.5661954879760742, "rewards/length2tails_reward/mean": 0.6195429563522339, "rewards/length2tails_reward/std": 0.38124367594718933, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9420790672302246, "rewards/thermo_reward/std": 2.0163540840148926, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.46875, "completions/mean_terminated_length": 271.46875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0980948880314827, "epoch": 1.634, "frac_reward_zero_std": 0.0, "grad_norm": 0.08849485963582993, "learning_rate": 1.5551703516376416e-06, "loss": -0.0062, "num_tokens": 7108124.0, "reward": 12.822610855102539, "reward_std": 2.8037033081054688, "rewards/fitness_reward/mean": 6.9903459548950195, "rewards/fitness_reward/std": 1.7917176485061646, "rewards/kidney_reward/mean": 2.4557790756225586, "rewards/kidney_reward/std": 0.5509402751922607, "rewards/length2tails_reward/mean": 0.7287914752960205, "rewards/length2tails_reward/std": 0.28611382842063904, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.203606128692627, "rewards/thermo_reward/std": 1.2543768882751465, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.71875, "completions/mean_terminated_length": 271.71875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09814401809126139, "epoch": 1.6360000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.626882791519165, "learning_rate": 1.554103373960668e-06, "loss": -0.0043, "num_tokens": 7116851.0, "reward": 11.834861755371094, "reward_std": 4.401232719421387, "rewards/fitness_reward/mean": 6.920083045959473, "rewards/fitness_reward/std": 1.9360997676849365, "rewards/kidney_reward/mean": 2.083052635192871, "rewards/kidney_reward/std": 1.3474140167236328, "rewards/length2tails_reward/mean": 0.7249131202697754, "rewards/length2tails_reward/std": 0.3437511622905731, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.659235954284668, "rewards/thermo_reward/std": 2.278928756713867, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.5625, "completions/mean_terminated_length": 273.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09511575661599636, "epoch": 1.638, "frac_reward_zero_std": 0.0, "grad_norm": 0.06473495066165924, "learning_rate": 1.5530354851999214e-06, "loss": -0.0049, "num_tokens": 7125637.0, "reward": 12.665140151977539, "reward_std": 3.4468960762023926, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.422769546508789, "rewards/kidney_reward/std": 0.7039707899093628, "rewards/length2tails_reward/mean": 0.8116496801376343, "rewards/length2tails_reward/std": 0.2855256497859955, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0081522464752197, "rewards/thermo_reward/std": 1.7059727907180786, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.59375, "completions/mean_terminated_length": 270.59375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08755200356245041, "epoch": 1.6400000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.07360640913248062, "learning_rate": 1.5519666871112763e-06, "loss": -0.0051, "num_tokens": 7134328.0, "reward": 13.005146026611328, "reward_std": 2.5275611877441406, "rewards/fitness_reward/mean": 7.009998321533203, "rewards/fitness_reward/std": 1.9866119623184204, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.6213452816009521, "rewards/length2tails_reward/std": 0.3339705765247345, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.261251449584961, "rewards/thermo_reward/std": 1.0655933618545532, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10028571262955666, "epoch": 1.642, "frac_reward_zero_std": 0.0, "grad_norm": 0.07514243572950363, "learning_rate": 1.5508969814521024e-06, "loss": -0.0069, "num_tokens": 7143071.0, "reward": 13.104477882385254, "reward_std": 3.2510416507720947, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.511730670928955, "rewards/kidney_reward/std": 0.5132253766059875, "rewards/length2tails_reward/mean": 0.7408324480056763, "rewards/length2tails_reward/std": 0.2940310835838318, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.365610361099243, "rewards/thermo_reward/std": 1.4092084169387817, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.40625, "completions/mean_terminated_length": 270.40625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.09334253240376711, "epoch": 1.6440000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.3840697705745697, "learning_rate": 1.5498263699812623e-06, "loss": -0.0057, "num_tokens": 7151756.0, "reward": 12.753169059753418, "reward_std": 3.204993963241577, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.2599570751190186, "rewards/kidney_reward/std": 1.1070548295974731, "rewards/length2tails_reward/mean": 0.709437906742096, "rewards/length2tails_reward/std": 0.3272755742073059, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.01859188079834, "rewards/thermo_reward/std": 1.9252240657806396, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.9375, "completions/mean_terminated_length": 272.9375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10825805831700563, "epoch": 1.646, "frac_reward_zero_std": 0.0, "grad_norm": 0.15281492471694946, "learning_rate": 1.5487548544591073e-06, "loss": -0.0031, "num_tokens": 7160522.0, "reward": 13.315460205078125, "reward_std": 1.6204856634140015, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8107447028160095, "rewards/length2tails_reward/std": 0.2631240785121918, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1740806102752686, "rewards/thermo_reward/std": 1.5049045085906982, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10472884587943554, "epoch": 1.6480000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.07853143662214279, "learning_rate": 1.5476824366474754e-06, "loss": 0.0021, "num_tokens": 7169238.0, "reward": 13.464456558227539, "reward_std": 1.1619950532913208, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7010312676429749, "rewards/length2tails_reward/std": 0.2853972315788269, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3066892623901367, "rewards/thermo_reward/std": 1.1649514436721802, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09919088799506426, "epoch": 1.65, "frac_reward_zero_std": 0.0, "grad_norm": 0.14531446993350983, "learning_rate": 1.5466091183096884e-06, "loss": -0.0041, "num_tokens": 7177975.0, "reward": 12.735475540161133, "reward_std": 2.1816654205322266, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.422218084335327, "rewards/kidney_reward/std": 0.594012975692749, "rewards/length2tails_reward/mean": 0.7504751682281494, "rewards/length2tails_reward/std": 0.272549033164978, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.777024984359741, "rewards/thermo_reward/std": 1.803603172302246, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09980232920497656, "epoch": 1.6520000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.9415404200553894, "learning_rate": 1.5455349012105486e-06, "loss": -0.0053, "num_tokens": 7186739.0, "reward": 12.596288681030273, "reward_std": 2.6109566688537598, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3617115020751953, "rewards/kidney_reward/std": 0.9074901938438416, "rewards/length2tails_reward/mean": 0.7563230395317078, "rewards/length2tails_reward/std": 0.3114321231842041, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7552688121795654, "rewards/thermo_reward/std": 1.6710541248321533, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.71875, "completions/mean_terminated_length": 272.71875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10263726208359003, "epoch": 1.654, "frac_reward_zero_std": 0.0, "grad_norm": 0.07845157384872437, "learning_rate": 1.5444597871163359e-06, "loss": 0.0014, "num_tokens": 7195498.0, "reward": 13.242258071899414, "reward_std": 1.4796823263168335, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7627227902412415, "rewards/length2tails_reward/std": 0.3197879493236542, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.275416851043701, "rewards/thermo_reward/std": 1.0002073049545288, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10090090520679951, "epoch": 1.6560000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.15500804781913757, "learning_rate": 1.5433837777948058e-06, "loss": -0.0038, "num_tokens": 7204236.0, "reward": 12.889111518859863, "reward_std": 2.009411573410034, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4222664833068848, "rewards/kidney_reward/std": 0.5937747359275818, "rewards/length2tails_reward/mean": 0.6735501289367676, "rewards/length2tails_reward/std": 0.3572605550289154, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9383044242858887, "rewards/thermo_reward/std": 1.5431135892868042, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09415333159267902, "epoch": 1.658, "frac_reward_zero_std": 0.0, "grad_norm": 0.19570784270763397, "learning_rate": 1.5423068750151846e-06, "loss": -0.0063, "num_tokens": 7212967.0, "reward": 13.097809791564941, "reward_std": 2.0937092304229736, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5107388496398926, "rewards/kidney_reward/std": 0.5185781121253967, "rewards/length2tails_reward/mean": 0.6930172443389893, "rewards/length2tails_reward/std": 0.34632742404937744, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.056584119796753, "rewards/thermo_reward/std": 1.6953647136688232, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10683467425405979, "epoch": 1.6600000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.21493102610111237, "learning_rate": 1.5412290805481684e-06, "loss": -0.0015, "num_tokens": 7221679.0, "reward": 12.32304859161377, "reward_std": 5.012633323669434, "rewards/fitness_reward/mean": 7.001071453094482, "rewards/fitness_reward/std": 2.0371105670928955, "rewards/kidney_reward/mean": 2.3606696128845215, "rewards/kidney_reward/std": 1.2035049200057983, "rewards/length2tails_reward/mean": 0.7072513103485107, "rewards/length2tails_reward/std": 0.3007599115371704, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7905826568603516, "rewards/thermo_reward/std": 2.080504894256592, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09869087766855955, "epoch": 1.662, "frac_reward_zero_std": 0.0, "grad_norm": 0.13675719499588013, "learning_rate": 1.5401503961659201e-06, "loss": 0.0012, "num_tokens": 7230422.0, "reward": 13.259733200073242, "reward_std": 1.3868850469589233, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.79951411485672, "rewards/length2tails_reward/std": 0.20172974467277527, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1468350887298584, "rewards/thermo_reward/std": 1.2325853109359741, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.46875, "completions/mean_terminated_length": 270.46875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.09462389722466469, "epoch": 1.6640000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 1.2891161441802979, "learning_rate": 1.5390708236420645e-06, "loss": -0.0078, "num_tokens": 7239109.0, "reward": 12.70787525177002, "reward_std": 2.7165496349334717, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3758201599121094, "rewards/kidney_reward/std": 0.7904667854309082, "rewards/length2tails_reward/mean": 0.6734529733657837, "rewards/length2tails_reward/std": 0.3371480703353882, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8610341548919678, "rewards/thermo_reward/std": 1.8633620738983154, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09248152980580926, "epoch": 1.666, "frac_reward_zero_std": 0.0, "grad_norm": 0.08377497643232346, "learning_rate": 1.5379903647516877e-06, "loss": -0.0066, "num_tokens": 7247853.0, "reward": 13.391145706176758, "reward_std": 1.536651372909546, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7327719330787659, "rewards/length2tails_reward/std": 0.3565005660057068, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2575631141662598, "rewards/thermo_reward/std": 1.441807508468628, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.0968128452077508, "epoch": 1.6680000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.15227289497852325, "learning_rate": 1.5369090212713322e-06, "loss": -0.0027, "num_tokens": 7256617.0, "reward": 13.647794723510742, "reward_std": 0.6356459856033325, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7752425670623779, "rewards/length2tails_reward/std": 0.30526676774024963, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.50996470451355, "rewards/thermo_reward/std": 0.5615194439888, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1001192070543766, "epoch": 1.67, "frac_reward_zero_std": 0.0, "grad_norm": 0.1381293684244156, "learning_rate": 1.5358267949789964e-06, "loss": -0.0053, "num_tokens": 7265367.0, "reward": 12.299043655395508, "reward_std": 4.733181953430176, "rewards/fitness_reward/mean": 6.9439544677734375, "rewards/fitness_reward/std": 2.050342321395874, "rewards/kidney_reward/mean": 2.3449745178222656, "rewards/kidney_reward/std": 1.142852544784546, "rewards/length2tails_reward/mean": 0.7524625658988953, "rewards/length2tails_reward/std": 0.2887047231197357, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.834869146347046, "rewards/thermo_reward/std": 1.8959267139434814, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.96875, "completions/mean_terminated_length": 270.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09611952304840088, "epoch": 1.6720000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.09297409653663635, "learning_rate": 1.5347436876541295e-06, "loss": -0.0029, "num_tokens": 7274070.0, "reward": 13.16862964630127, "reward_std": 1.9593685865402222, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4035873413085938, "rewards/kidney_reward/std": 0.7815843224525452, "rewards/length2tails_reward/mean": 0.7112776637077332, "rewards/length2tails_reward/std": 0.25806063413619995, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.232728958129883, "rewards/thermo_reward/std": 1.303803563117981, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09741265419870615, "epoch": 1.674, "frac_reward_zero_std": 0.0, "grad_norm": 0.09869790822267532, "learning_rate": 1.533659701077629e-06, "loss": -0.0073, "num_tokens": 7282818.0, "reward": 13.390752792358398, "reward_std": 1.6376488208770752, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7550690174102783, "rewards/length2tails_reward/std": 0.30704745650291443, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2549397945404053, "rewards/thermo_reward/std": 1.5037169456481934, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09892761148512363, "epoch": 1.6760000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.05691608041524887, "learning_rate": 1.5325748370318383e-06, "loss": -0.0045, "num_tokens": 7291572.0, "reward": 12.643072128295898, "reward_std": 3.3166003227233887, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.4200446605682373, "rewards/kidney_reward/std": 0.7146202325820923, "rewards/length2tails_reward/mean": 0.7664644718170166, "rewards/length2tails_reward/std": 0.29053762555122375, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9933269023895264, "rewards/thermo_reward/std": 1.3843075037002563, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.28125, "completions/mean_terminated_length": 273.28125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10477215051651001, "epoch": 1.678, "frac_reward_zero_std": 0.0, "grad_norm": 0.15455517172813416, "learning_rate": 1.5314890973005445e-06, "loss": 0.0024, "num_tokens": 7300349.0, "reward": 13.205035209655762, "reward_std": 1.625687599182129, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.8004459142684937, "rewards/length2tails_reward/std": 0.2789980173110962, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.119403600692749, "rewards/thermo_reward/std": 1.4113643169403076, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 281.4375, "completions/mean_terminated_length": 281.4375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1059467475861311, "epoch": 1.6800000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 6.29008674621582, "learning_rate": 1.530402483668973e-06, "loss": 0.1573, "num_tokens": 7309387.0, "reward": 12.298429489135742, "reward_std": 5.2238335609436035, "rewards/fitness_reward/mean": 6.982968330383301, "rewards/fitness_reward/std": 2.1395151615142822, "rewards/kidney_reward/mean": 2.219698905944824, "rewards/kidney_reward/std": 1.50632905960083, "rewards/length2tails_reward/mean": 0.7398964166641235, "rewards/length2tails_reward/std": 0.3255484402179718, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9217729568481445, "rewards/thermo_reward/std": 2.0766680240631104, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09004503022879362, "epoch": 1.682, "frac_reward_zero_std": 0.0, "grad_norm": 0.07574941962957382, "learning_rate": 1.5293149979237875e-06, "loss": -0.0046, "num_tokens": 7318095.0, "reward": 13.034381866455078, "reward_std": 2.6836977005004883, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.4217729568481445, "rewards/kidney_reward/std": 1.0101513862609863, "rewards/length2tails_reward/mean": 0.6856233477592468, "rewards/length2tails_reward/std": 0.33994343876838684, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.197880744934082, "rewards/thermo_reward/std": 1.4554771184921265, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.96875, "completions/mean_terminated_length": 272.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0994929876178503, "epoch": 1.6840000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.24888205528259277, "learning_rate": 1.5282266418530844e-06, "loss": 0.0007, "num_tokens": 7326862.0, "reward": 12.968977928161621, "reward_std": 4.216531276702881, "rewards/fitness_reward/mean": 7.013382911682129, "rewards/fitness_reward/std": 1.9674652814865112, "rewards/kidney_reward/mean": 2.4493489265441895, "rewards/kidney_reward/std": 1.0020034313201904, "rewards/length2tails_reward/mean": 0.8029117584228516, "rewards/length2tails_reward/std": 0.2570558190345764, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3259544372558594, "rewards/thermo_reward/std": 1.3275809288024902, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 273.90625, "completions/mean_terminated_length": 273.90625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.13375951629132032, "epoch": 1.686, "frac_reward_zero_std": 0.0, "grad_norm": 0.29555225372314453, "learning_rate": 1.5271374172463922e-06, "loss": -0.0007, "num_tokens": 7335659.0, "reward": 13.37346076965332, "reward_std": 1.3563597202301025, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7223860025405884, "rewards/length2tails_reward/std": 0.30389323830604553, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2409167289733887, "rewards/thermo_reward/std": 1.2600083351135254, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09282128792256117, "epoch": 1.688, "frac_reward_zero_std": 0.0, "grad_norm": 0.11549926549196243, "learning_rate": 1.526047325894667e-06, "loss": -0.0006, "num_tokens": 7344431.0, "reward": 13.49197006225586, "reward_std": 1.3360706567764282, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8050989508628845, "rewards/length2tails_reward/std": 0.2730928063392639, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3511548042297363, "rewards/thermo_reward/std": 1.1984219551086426, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.40625, "completions/mean_terminated_length": 273.40625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.11189008224755526, "epoch": 1.69, "frac_reward_zero_std": 0.0, "grad_norm": 0.11807180196046829, "learning_rate": 1.5249563695902903e-06, "loss": -0.0042, "num_tokens": 7353212.0, "reward": 13.080742835998535, "reward_std": 2.1417932510375977, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5310442447662354, "rewards/kidney_reward/std": 0.539863646030426, "rewards/length2tails_reward/mean": 0.8166994452476501, "rewards/length2tails_reward/std": 0.2542852759361267, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.006843090057373, "rewards/thermo_reward/std": 1.7348278760910034, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.09375, "completions/mean_terminated_length": 272.09375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10713509004563093, "epoch": 1.692, "frac_reward_zero_std": 0.0, "grad_norm": 0.10242803394794464, "learning_rate": 1.5238645501270652e-06, "loss": -0.001, "num_tokens": 7361951.0, "reward": 12.868669509887695, "reward_std": 1.8826740980148315, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5312321186065674, "rewards/kidney_reward/std": 0.5388016104698181, "rewards/length2tails_reward/mean": 0.7111541032791138, "rewards/length2tails_reward/std": 0.3376815617084503, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8051371574401855, "rewards/thermo_reward/std": 1.533287525177002, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09818495530635118, "epoch": 1.694, "frac_reward_zero_std": 0.0, "grad_norm": 0.1153191477060318, "learning_rate": 1.5227718693002153e-06, "loss": -0.0083, "num_tokens": 7370671.0, "reward": 12.53122615814209, "reward_std": 3.189180850982666, "rewards/fitness_reward/mean": 7.188657760620117, "rewards/fitness_reward/std": 0.7179933190345764, "rewards/kidney_reward/mean": 2.359818458557129, "rewards/kidney_reward/std": 0.848675012588501, "rewards/length2tails_reward/mean": 0.7231026291847229, "rewards/length2tails_reward/std": 0.3215695023536682, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8104398250579834, "rewards/thermo_reward/std": 1.989101529121399, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.71875, "completions/mean_terminated_length": 272.71875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09497405029833317, "epoch": 1.696, "frac_reward_zero_std": 0.0, "grad_norm": 0.10651351511478424, "learning_rate": 1.5216783289063785e-06, "loss": -0.0064, "num_tokens": 7379430.0, "reward": 12.566158294677734, "reward_std": 3.863691806793213, "rewards/fitness_reward/mean": 6.99554443359375, "rewards/fitness_reward/std": 1.7628074884414673, "rewards/kidney_reward/mean": 2.3516533374786377, "rewards/kidney_reward/std": 1.0224820375442505, "rewards/length2tails_reward/mean": 0.7389246225357056, "rewards/length2tails_reward/std": 0.33149829506874084, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0450682640075684, "rewards/thermo_reward/std": 1.6298531293869019, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09245220106095076, "epoch": 1.698, "frac_reward_zero_std": 0.0, "grad_norm": 0.06719689071178436, "learning_rate": 1.5205839307436086e-06, "loss": -0.0062, "num_tokens": 7388162.0, "reward": 12.616239547729492, "reward_std": 4.165755748748779, "rewards/fitness_reward/mean": 6.682142734527588, "rewards/fitness_reward/std": 2.684154748916626, "rewards/kidney_reward/mean": 2.4843716621398926, "rewards/kidney_reward/std": 0.5299732089042664, "rewards/length2tails_reward/mean": 0.6952399015426636, "rewards/length2tails_reward/std": 0.328195184469223, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2802019119262695, "rewards/thermo_reward/std": 1.1215753555297852, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.1875, "completions/mean_terminated_length": 272.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10125975962728262, "epoch": 1.7, "frac_reward_zero_std": 0.0, "grad_norm": 0.6502317190170288, "learning_rate": 1.5194886766113672e-06, "loss": -0.0056, "num_tokens": 7396904.0, "reward": 12.429765701293945, "reward_std": 3.9128546714782715, "rewards/fitness_reward/mean": 6.938035011291504, "rewards/fitness_reward/std": 1.78042471408844, "rewards/kidney_reward/mean": 2.3061063289642334, "rewards/kidney_reward/std": 0.9397987723350525, "rewards/length2tails_reward/mean": 0.7385504245758057, "rewards/length2tails_reward/std": 0.3328675329685211, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0117697715759277, "rewards/thermo_reward/std": 1.61867094039917, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09814795758575201, "epoch": 1.702, "frac_reward_zero_std": 0.0, "grad_norm": 0.09595108777284622, "learning_rate": 1.5183925683105251e-06, "loss": -0.0025, "num_tokens": 7405672.0, "reward": 13.521875381469727, "reward_std": 1.1724050045013428, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7721210718154907, "rewards/length2tails_reward/std": 0.30832090973854065, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3569979667663574, "rewards/thermo_reward/std": 1.1689194440841675, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.96875, "completions/mean_terminated_length": 272.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10071547608822584, "epoch": 1.704, "frac_reward_zero_std": 0.0, "grad_norm": 0.07803252339363098, "learning_rate": 1.5172956076433568e-06, "loss": -0.0051, "num_tokens": 7414439.0, "reward": 13.553016662597656, "reward_std": 1.0260266065597534, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7786349058151245, "rewards/length2tails_reward/std": 0.3229070007801056, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.442206621170044, "rewards/thermo_reward/std": 0.9379639029502869, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09173226915299892, "epoch": 1.706, "frac_reward_zero_std": 0.0, "grad_norm": 0.10252796113491058, "learning_rate": 1.5161977964135387e-06, "loss": -0.0048, "num_tokens": 7423199.0, "reward": 13.331336975097656, "reward_std": 1.8175561428070068, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.475644588470459, "rewards/kidney_reward/std": 0.5754398703575134, "rewards/length2tails_reward/mean": 0.7746952772140503, "rewards/length2tails_reward/std": 0.2770718038082123, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.317038059234619, "rewards/thermo_reward/std": 1.2796655893325806, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09569068439304829, "epoch": 1.708, "frac_reward_zero_std": 0.0, "grad_norm": 0.19964933395385742, "learning_rate": 1.515099136426145e-06, "loss": 0.0002, "num_tokens": 7431919.0, "reward": 13.394378662109375, "reward_std": 2.037860155105591, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.50772762298584, "rewards/kidney_reward/std": 0.671763002872467, "rewards/length2tails_reward/mean": 0.7298898696899414, "rewards/length2tails_reward/std": 0.26895391941070557, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4099857807159424, "rewards/thermo_reward/std": 1.0960407257080078, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09732735715806484, "epoch": 1.71, "frac_reward_zero_std": 0.0, "grad_norm": 0.08672071248292923, "learning_rate": 1.5139996294876465e-06, "loss": -0.0021, "num_tokens": 7440623.0, "reward": 13.514305114746094, "reward_std": 1.774261474609375, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5317673683166504, "rewards/kidney_reward/std": 0.5357728600502014, "rewards/length2tails_reward/mean": 0.6585451364517212, "rewards/length2tails_reward/std": 0.3462117314338684, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4554989337921143, "rewards/thermo_reward/std": 1.2565979957580566, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.10142287611961365, "epoch": 1.712, "frac_reward_zero_std": 0.0, "grad_norm": 0.09674447774887085, "learning_rate": 1.5128992774059062e-06, "loss": -0.0026, "num_tokens": 7449287.0, "reward": 12.82474136352539, "reward_std": 2.4676270484924316, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7117809057235718, "rewards/length2tails_reward/std": 0.26468193531036377, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.747976541519165, "rewards/thermo_reward/std": 2.237440347671509, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 272.15625, "completions/mean_terminated_length": 272.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09790748357772827, "epoch": 1.714, "frac_reward_zero_std": 0.0, "grad_norm": 0.09507341682910919, "learning_rate": 1.511798081990176e-06, "loss": -0.0018, "num_tokens": 7458028.0, "reward": 13.191679954528809, "reward_std": 1.9008152484893799, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4973230361938477, "rewards/kidney_reward/std": 0.5914480686187744, "rewards/length2tails_reward/mean": 0.6992174386978149, "rewards/length2tails_reward/std": 0.3379260003566742, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.16325044631958, "rewards/thermo_reward/std": 1.4118142127990723, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.4375, "completions/mean_terminated_length": 271.4375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09885876066982746, "epoch": 1.716, "frac_reward_zero_std": 0.0, "grad_norm": 0.07631216198205948, "learning_rate": 1.510696045051096e-06, "loss": -0.0075, "num_tokens": 7466746.0, "reward": 13.255902290344238, "reward_std": 1.6920017004013062, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4815330505371094, "rewards/kidney_reward/std": 0.5446864366531372, "rewards/length2tails_reward/mean": 0.677985429763794, "rewards/length2tails_reward/std": 0.34782329201698303, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.24538516998291, "rewards/thermo_reward/std": 1.4120094776153564, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.9375, "completions/mean_terminated_length": 270.9375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10496858600527048, "epoch": 1.718, "frac_reward_zero_std": 0.0, "grad_norm": 0.12887302041053772, "learning_rate": 1.5095931684006882e-06, "loss": 0.0028, "num_tokens": 7475448.0, "reward": 13.230815887451172, "reward_std": 1.7123416662216187, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.6752799153327942, "rewards/length2tails_reward/std": 0.31890955567359924, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1303417682647705, "rewards/thermo_reward/std": 1.5595749616622925, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 271.09375, "completions/mean_terminated_length": 271.09375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.10163374803960323, "epoch": 1.72, "frac_reward_zero_std": 0.0, "grad_norm": 1.293250560760498, "learning_rate": 1.5084894538523566e-06, "loss": 0.0121, "num_tokens": 7484155.0, "reward": 12.42259407043457, "reward_std": 3.9458916187286377, "rewards/fitness_reward/mean": 7.188657283782959, "rewards/fitness_reward/std": 0.7179933190345764, "rewards/kidney_reward/mean": 2.3019797801971436, "rewards/kidney_reward/std": 1.239792823791504, "rewards/length2tails_reward/mean": 0.7115887999534607, "rewards/length2tails_reward/std": 0.3159739375114441, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.760798454284668, "rewards/thermo_reward/std": 2.2771968841552734, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10072409827262163, "epoch": 1.722, "frac_reward_zero_std": 0.0, "grad_norm": 0.09440076351165771, "learning_rate": 1.507384903220882e-06, "loss": -0.0049, "num_tokens": 7492876.0, "reward": 13.463882446289062, "reward_std": 1.055253028869629, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7122754454612732, "rewards/length2tails_reward/std": 0.3253532946109772, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.304990291595459, "rewards/thermo_reward/std": 1.043897032737732, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.84375, "completions/mean_terminated_length": 272.84375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10071540251374245, "epoch": 1.724, "frac_reward_zero_std": 0.0, "grad_norm": 0.134958416223526, "learning_rate": 1.506279518322421e-06, "loss": 0.0022, "num_tokens": 7501639.0, "reward": 12.194796562194824, "reward_std": 3.917198657989502, "rewards/fitness_reward/mean": 6.863818168640137, "rewards/fitness_reward/std": 2.238445281982422, "rewards/kidney_reward/mean": 2.2814149856567383, "rewards/kidney_reward/std": 0.9608786106109619, "rewards/length2tails_reward/mean": 0.7513444423675537, "rewards/length2tails_reward/std": 0.31943386793136597, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.874429702758789, "rewards/thermo_reward/std": 1.6127493381500244, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09387815557420254, "epoch": 1.726, "frac_reward_zero_std": 0.0, "grad_norm": 0.08303510397672653, "learning_rate": 1.5051733009745012e-06, "loss": -0.0002, "num_tokens": 7510395.0, "reward": 13.363080978393555, "reward_std": 2.55647349357605, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4396750926971436, "rewards/kidney_reward/std": 0.9102076292037964, "rewards/length2tails_reward/mean": 0.8173313140869141, "rewards/length2tails_reward/std": 0.21922850608825684, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.437997817993164, "rewards/thermo_reward/std": 1.3501050472259521, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10419859923422337, "epoch": 1.728, "frac_reward_zero_std": 0.0, "grad_norm": 0.1179058626294136, "learning_rate": 1.5040662529960187e-06, "loss": 0.0029, "num_tokens": 7519149.0, "reward": 13.316216468811035, "reward_std": 1.682897686958313, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.456533432006836, "rewards/kidney_reward/std": 0.5471765995025635, "rewards/length2tails_reward/mean": 0.8196427822113037, "rewards/length2tails_reward/std": 0.21029697358608246, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3165335655212402, "rewards/thermo_reward/std": 1.176936149597168, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10365048982203007, "epoch": 1.73, "frac_reward_zero_std": 0.0, "grad_norm": 0.12642066180706024, "learning_rate": 1.5029583762072355e-06, "loss": -0.0025, "num_tokens": 7527884.0, "reward": 11.681166648864746, "reward_std": 4.486652374267578, "rewards/fitness_reward/mean": 7.049827575683594, "rewards/fitness_reward/std": 1.7613033056259155, "rewards/kidney_reward/mean": 2.210993766784668, "rewards/kidney_reward/std": 1.164110779762268, "rewards/length2tails_reward/mean": 0.702089786529541, "rewards/length2tails_reward/std": 0.3295642137527466, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.250135898590088, "rewards/thermo_reward/std": 2.3425369262695312, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.5625, "completions/mean_terminated_length": 273.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10005826782435179, "epoch": 1.732, "frac_reward_zero_std": 0.0, "grad_norm": 0.2355157732963562, "learning_rate": 1.5018496724297775e-06, "loss": -0.0039, "num_tokens": 7536670.0, "reward": 12.99422550201416, "reward_std": 2.591430425643921, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.46018648147583, "rewards/kidney_reward/std": 0.6573600769042969, "rewards/length2tails_reward/mean": 0.8218222856521606, "rewards/length2tails_reward/std": 0.2646983563899994, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0481810569763184, "rewards/thermo_reward/std": 1.797224998474121, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.40625, "completions/mean_terminated_length": 272.40625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0915538715198636, "epoch": 1.734, "frac_reward_zero_std": 0.0, "grad_norm": 0.12455841153860092, "learning_rate": 1.5007401434866288e-06, "loss": 0.002, "num_tokens": 7545419.0, "reward": 13.686896324157715, "reward_std": 0.6071659326553345, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7674039602279663, "rewards/length2tails_reward/std": 0.2526814043521881, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498504638671875, "rewards/thermo_reward/std": 0.5360844731330872, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.90625, "completions/mean_terminated_length": 271.90625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09496522974222898, "epoch": 1.736, "frac_reward_zero_std": 0.0, "grad_norm": 0.1116664856672287, "learning_rate": 1.4996297912021315e-06, "loss": 0.0045, "num_tokens": 7554152.0, "reward": 13.388604164123535, "reward_std": 1.7250062227249146, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5111966133117676, "rewards/kidney_reward/std": 0.5161065459251404, "rewards/length2tails_reward/mean": 0.7519122362136841, "rewards/length2tails_reward/std": 0.28855007886886597, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.341031074523926, "rewards/thermo_reward/std": 1.2499605417251587, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08507314044982195, "epoch": 1.738, "frac_reward_zero_std": 0.0, "grad_norm": 1.9078298807144165, "learning_rate": 1.4985186174019805e-06, "loss": -0.0003, "num_tokens": 7562902.0, "reward": 11.907567977905273, "reward_std": 5.713685512542725, "rewards/fitness_reward/mean": 6.546481132507324, "rewards/fitness_reward/std": 2.9839866161346436, "rewards/kidney_reward/mean": 2.199110984802246, "rewards/kidney_reward/std": 1.447792410850525, "rewards/length2tails_reward/mean": 0.7063984870910645, "rewards/length2tails_reward/std": 0.3167133331298828, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9913363456726074, "rewards/thermo_reward/std": 1.8961188793182373, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09846653696149588, "epoch": 1.74, "frac_reward_zero_std": 0.0, "grad_norm": 0.12279684096574783, "learning_rate": 1.4974066239132218e-06, "loss": -0.0032, "num_tokens": 7571634.0, "reward": 12.903898239135742, "reward_std": 2.871889352798462, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3835763931274414, "rewards/kidney_reward/std": 0.8628641366958618, "rewards/length2tails_reward/mean": 0.7258918285369873, "rewards/length2tails_reward/std": 0.3222261369228363, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0440564155578613, "rewards/thermo_reward/std": 1.848998785018921, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10395078919827938, "epoch": 1.742, "frac_reward_zero_std": 0.0, "grad_norm": 0.1666652113199234, "learning_rate": 1.4962938125642501e-06, "loss": 0.0005, "num_tokens": 7580394.0, "reward": 12.472291946411133, "reward_std": 4.9836344718933105, "rewards/fitness_reward/mean": 6.719928741455078, "rewards/fitness_reward/std": 2.5253584384918213, "rewards/kidney_reward/mean": 2.338550567626953, "rewards/kidney_reward/std": 1.087242603302002, "rewards/length2tails_reward/mean": 0.7961379289627075, "rewards/length2tails_reward/std": 0.26709046959877014, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2341983318328857, "rewards/thermo_reward/std": 1.4723800420761108, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.71875, "completions/mean_terminated_length": 272.71875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09795780386775732, "epoch": 1.744, "frac_reward_zero_std": 0.0, "grad_norm": 0.34581565856933594, "learning_rate": 1.495180185184803e-06, "loss": 0.0037, "num_tokens": 7589153.0, "reward": 12.669710159301758, "reward_std": 5.741780757904053, "rewards/fitness_reward/mean": 6.930096626281738, "rewards/fitness_reward/std": 2.438606023788452, "rewards/kidney_reward/mean": 2.3235361576080322, "rewards/kidney_reward/std": 1.5616281032562256, "rewards/length2tails_reward/mean": 0.7937979102134705, "rewards/length2tails_reward/std": 0.24576689302921295, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2366983890533447, "rewards/thermo_reward/std": 1.8013713359832764, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.1063917875289917, "epoch": 1.746, "frac_reward_zero_std": 0.0, "grad_norm": 0.13437135517597198, "learning_rate": 1.4940657436059616e-06, "loss": 0.0014, "num_tokens": 7597884.0, "reward": 12.676504135131836, "reward_std": 4.748298168182373, "rewards/fitness_reward/mean": 7.001252174377441, "rewards/fitness_reward/std": 2.0360865592956543, "rewards/kidney_reward/mean": 2.4072861671447754, "rewards/kidney_reward/std": 1.2399449348449707, "rewards/length2tails_reward/mean": 0.7146443128585815, "rewards/length2tails_reward/std": 0.2839556634426117, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0964999198913574, "rewards/thermo_reward/std": 1.6927036046981812, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.11050812620669603, "epoch": 1.748, "frac_reward_zero_std": 0.0, "grad_norm": 0.1196875348687172, "learning_rate": 1.492950489660145e-06, "loss": -0.0013, "num_tokens": 7606642.0, "reward": 12.8109712600708, "reward_std": 2.9641854763031006, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3206534385681152, "rewards/kidney_reward/std": 0.9683699607849121, "rewards/length2tails_reward/mean": 0.7905367612838745, "rewards/length2tails_reward/std": 0.2640564441680908, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0075883865356445, "rewards/thermo_reward/std": 1.8859575986862183, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09981518425047398, "epoch": 1.75, "frac_reward_zero_std": 0.0, "grad_norm": 0.1955115646123886, "learning_rate": 1.4918344251811078e-06, "loss": 0.0011, "num_tokens": 7615390.0, "reward": 12.619268417358398, "reward_std": 4.635945796966553, "rewards/fitness_reward/mean": 7.005709171295166, "rewards/fitness_reward/std": 2.0108749866485596, "rewards/kidney_reward/mean": 2.356231451034546, "rewards/kidney_reward/std": 1.0809255838394165, "rewards/length2tails_reward/mean": 0.7344810366630554, "rewards/length2tails_reward/std": 0.33714422583580017, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0838804244995117, "rewards/thermo_reward/std": 1.7415902614593506, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 272.65625, "completions/mean_terminated_length": 272.65625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11094592232257128, "epoch": 1.752, "frac_reward_zero_std": 0.0, "grad_norm": 0.08621875941753387, "learning_rate": 1.490717552003938e-06, "loss": -0.0012, "num_tokens": 7624147.0, "reward": 13.21161937713623, "reward_std": 1.822740077972412, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7981569170951843, "rewards/length2tails_reward/std": 0.24679215252399445, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0988576412200928, "rewards/thermo_reward/std": 1.6984132528305054, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.40625, "completions/mean_terminated_length": 272.40625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10080702416598797, "epoch": 1.754, "frac_reward_zero_std": 0.0, "grad_norm": 0.15915358066558838, "learning_rate": 1.4895998719650523e-06, "loss": 0.0016, "num_tokens": 7632896.0, "reward": 12.766881942749023, "reward_std": 4.318146228790283, "rewards/fitness_reward/mean": 6.9977922439575195, "rewards/fitness_reward/std": 2.055662155151367, "rewards/kidney_reward/mean": 2.4041614532470703, "rewards/kidney_reward/std": 0.9624440670013428, "rewards/length2tails_reward/mean": 0.7578998804092407, "rewards/length2tails_reward/std": 0.274631142616272, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.189138889312744, "rewards/thermo_reward/std": 1.492705225944519, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0937582403421402, "epoch": 1.756, "frac_reward_zero_std": 0.0, "grad_norm": 0.0773591473698616, "learning_rate": 1.4884813869021952e-06, "loss": -0.0012, "num_tokens": 7641646.0, "reward": 13.149742126464844, "reward_std": 1.9497835636138916, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.436121702194214, "rewards/kidney_reward/std": 0.6512018442153931, "rewards/length2tails_reward/mean": 0.7778726816177368, "rewards/length2tails_reward/std": 0.24082037806510925, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.174647808074951, "rewards/thermo_reward/std": 1.4494233131408691, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09923964831978083, "epoch": 1.758, "frac_reward_zero_std": 0.0, "grad_norm": 0.13162273168563843, "learning_rate": 1.4873620986544347e-06, "loss": 0.0015, "num_tokens": 7650389.0, "reward": 12.932975769042969, "reward_std": 3.2268340587615967, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.381955623626709, "rewards/kidney_reward/std": 1.0001392364501953, "rewards/length2tails_reward/mean": 0.7600358724594116, "rewards/length2tails_reward/std": 0.27038806676864624, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1288509368896484, "rewards/thermo_reward/std": 1.8431106805801392, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.34375, "completions/mean_terminated_length": 271.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09573240112513304, "epoch": 1.76, "frac_reward_zero_std": 0.0, "grad_norm": 0.06183909624814987, "learning_rate": 1.4862420090621581e-06, "loss": -0.0036, "num_tokens": 7659104.0, "reward": 13.072014808654785, "reward_std": 3.2107491493225098, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.511730670928955, "rewards/kidney_reward/std": 0.5132253766059875, "rewards/length2tails_reward/mean": 0.6726909875869751, "rewards/length2tails_reward/std": 0.34120577573776245, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.339961528778076, "rewards/thermo_reward/std": 1.349816918373108, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 271.59375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09770231042057276, "epoch": 1.762, "frac_reward_zero_std": 0.0, "grad_norm": 0.08552416414022446, "learning_rate": 1.485121119967072e-06, "loss": -0.0026, "num_tokens": 7667827.0, "reward": 13.009862899780273, "reward_std": 2.53602933883667, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.460869550704956, "rewards/kidney_reward/std": 0.792319655418396, "rewards/length2tails_reward/mean": 0.7326198816299438, "rewards/length2tails_reward/std": 0.3129405975341797, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0720553398132324, "rewards/thermo_reward/std": 1.6470363140106201, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10478513780981302, "epoch": 1.764, "frac_reward_zero_std": 0.0, "grad_norm": 0.1234990730881691, "learning_rate": 1.4839994332121968e-06, "loss": -0.0003, "num_tokens": 7676588.0, "reward": 13.49679946899414, "reward_std": 1.194043755531311, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.792144775390625, "rewards/length2tails_reward/std": 0.24374578893184662, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3572797775268555, "rewards/thermo_reward/std": 1.16750168800354, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 271.59375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09663060214370489, "epoch": 1.766, "frac_reward_zero_std": 0.0, "grad_norm": 0.20788384974002838, "learning_rate": 1.4828769506418643e-06, "loss": 0.002, "num_tokens": 7685311.0, "reward": 13.174077987670898, "reward_std": 1.7398223876953125, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4979214668273926, "rewards/kidney_reward/std": 0.5881804823875427, "rewards/length2tails_reward/mean": 0.7213119864463806, "rewards/length2tails_reward/std": 0.3026640713214874, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.142841339111328, "rewards/thermo_reward/std": 1.3288516998291016, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.15625, "completions/mean_terminated_length": 273.15625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09734796825796366, "epoch": 1.768, "frac_reward_zero_std": 0.0, "grad_norm": 0.1145319864153862, "learning_rate": 1.4817536741017151e-06, "loss": 0.0006, "num_tokens": 7694084.0, "reward": 13.582418441772461, "reward_std": 0.7006555795669556, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7939411401748657, "rewards/length2tails_reward/std": 0.2659618556499481, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.470078706741333, "rewards/thermo_reward/std": 0.5830413699150085, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10007060971111059, "epoch": 1.77, "frac_reward_zero_std": 0.0, "grad_norm": 0.07956361025571823, "learning_rate": 1.4806296054386957e-06, "loss": -0.0044, "num_tokens": 7702848.0, "reward": 13.668158531188965, "reward_std": 1.0154482126235962, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7641856670379639, "rewards/length2tails_reward/std": 0.33650773763656616, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5314338207244873, "rewards/thermo_reward/std": 0.8651680946350098, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 271.28125, "completions/mean_terminated_length": 271.28125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09572915825992823, "epoch": 1.772, "frac_reward_zero_std": 0.0, "grad_norm": 0.5946225523948669, "learning_rate": 1.479504746501054e-06, "loss": -0.0036, "num_tokens": 7711561.0, "reward": 13.458148956298828, "reward_std": 1.678314208984375, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.538149356842041, "rewards/kidney_reward/std": 0.4996722936630249, "rewards/length2tails_reward/mean": 0.6941894292831421, "rewards/length2tails_reward/std": 0.3023573160171509, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4469056129455566, "rewards/thermo_reward/std": 0.9156622290611267, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.1875, "completions/mean_terminated_length": 272.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.099006243981421, "epoch": 1.774, "frac_reward_zero_std": 0.0, "grad_norm": 0.09899494796991348, "learning_rate": 1.4783790991383378e-06, "loss": -0.0013, "num_tokens": 7720303.0, "reward": 13.350641250610352, "reward_std": 1.644244909286499, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5380842685699463, "rewards/kidney_reward/std": 0.5000393986701965, "rewards/length2tails_reward/mean": 0.7450613379478455, "rewards/length2tails_reward/std": 0.29141712188720703, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.276865243911743, "rewards/thermo_reward/std": 1.2720036506652832, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10529050510376692, "epoch": 1.776, "frac_reward_zero_std": 0.0, "grad_norm": 0.0689932107925415, "learning_rate": 1.4772526652013922e-06, "loss": -0.0061, "num_tokens": 7729047.0, "reward": 12.381019592285156, "reward_std": 3.4639840126037598, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.185664176940918, "rewards/kidney_reward/std": 1.2540359497070312, "rewards/length2tails_reward/mean": 0.7414818406105042, "rewards/length2tails_reward/std": 0.3098289966583252, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.717531681060791, "rewards/thermo_reward/std": 2.14951491355896, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09431830234825611, "epoch": 1.778, "frac_reward_zero_std": 0.0, "grad_norm": 0.10811066627502441, "learning_rate": 1.4761254465423536e-06, "loss": -0.0004, "num_tokens": 7737797.0, "reward": 12.986416816711426, "reward_std": 2.2744932174682617, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.3942477703094482, "rewards/kidney_reward/std": 0.7171282172203064, "rewards/length2tails_reward/mean": 0.784565806388855, "rewards/length2tails_reward/std": 0.2758873701095581, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.167546272277832, "rewards/thermo_reward/std": 1.4129695892333984, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 268.9375, "completions/mean_terminated_length": 268.9375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.09791536442935467, "epoch": 1.78, "frac_reward_zero_std": 0.0, "grad_norm": 0.6448939442634583, "learning_rate": 1.474997445014651e-06, "loss": -0.04, "num_tokens": 7746435.0, "reward": 11.629158020019531, "reward_std": 5.494712829589844, "rewards/fitness_reward/mean": 6.617328643798828, "rewards/fitness_reward/std": 2.9274072647094727, "rewards/kidney_reward/mean": 2.2886364459991455, "rewards/kidney_reward/std": 1.2951936721801758, "rewards/length2tails_reward/mean": 0.6833776831626892, "rewards/length2tails_reward/std": 0.3136868476867676, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.554854393005371, "rewards/thermo_reward/std": 2.314518928527832, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.40625, "completions/mean_terminated_length": 271.40625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09088954981416464, "epoch": 1.782, "frac_reward_zero_std": 0.0, "grad_norm": 0.13576436042785645, "learning_rate": 1.4738686624729987e-06, "loss": -0.0015, "num_tokens": 7755152.0, "reward": 12.47293472290039, "reward_std": 3.1947779655456543, "rewards/fitness_reward/mean": 7.131148338317871, "rewards/fitness_reward/std": 0.7751544713973999, "rewards/kidney_reward/mean": 2.206810474395752, "rewards/kidney_reward/std": 1.0620572566986084, "rewards/length2tails_reward/mean": 0.6976714134216309, "rewards/length2tails_reward/std": 0.3364195227622986, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9652092456817627, "rewards/thermo_reward/std": 1.6839019060134888, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09869909752160311, "epoch": 1.784, "frac_reward_zero_std": 0.0, "grad_norm": 0.10662391781806946, "learning_rate": 1.472739100773396e-06, "loss": 0.0029, "num_tokens": 7763902.0, "reward": 13.120617866516113, "reward_std": 1.752073884010315, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4723806381225586, "rewards/kidney_reward/std": 0.5926058888435364, "rewards/length2tails_reward/mean": 0.7800338864326477, "rewards/length2tails_reward/std": 0.26246973872184753, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.109048366546631, "rewards/thermo_reward/std": 1.3039993047714233, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09561163000762463, "epoch": 1.786, "frac_reward_zero_std": 0.0, "grad_norm": 0.10369518399238586, "learning_rate": 1.4716087617731242e-06, "loss": -0.0053, "num_tokens": 7772646.0, "reward": 13.807502746582031, "reward_std": 0.530958354473114, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7768849730491638, "rewards/length2tails_reward/std": 0.2708515226840973, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.6875, "completions/mean_terminated_length": 271.6875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09534350223839283, "epoch": 1.788, "frac_reward_zero_std": 0.0, "grad_norm": 0.08410871028900146, "learning_rate": 1.4704776473307406e-06, "loss": -0.0006, "num_tokens": 7781372.0, "reward": 13.64209270477295, "reward_std": 0.6280385255813599, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7182252407073975, "rewards/length2tails_reward/std": 0.36311620473861694, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.50996470451355, "rewards/thermo_reward/std": 0.5615194439888, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09412556514143944, "epoch": 1.79, "frac_reward_zero_std": 0.0, "grad_norm": 0.08788694441318512, "learning_rate": 1.4693457593060793e-06, "loss": -0.0019, "num_tokens": 7790108.0, "reward": 13.598801612854004, "reward_std": 0.6531413793563843, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.6841665506362915, "rewards/length2tails_reward/std": 0.33494260907173157, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.470078706741333, "rewards/thermo_reward/std": 0.5830413699150085, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.5, "completions/mean_terminated_length": 270.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09312474494799972, "epoch": 1.792, "frac_reward_zero_std": 0.0, "grad_norm": 0.07696857303380966, "learning_rate": 1.4682130995602458e-06, "loss": -0.0045, "num_tokens": 7798796.0, "reward": 12.536702156066895, "reward_std": 3.5887157917022705, "rewards/fitness_reward/mean": 7.051756381988525, "rewards/fitness_reward/std": 1.7503925561904907, "rewards/kidney_reward/mean": 2.402106523513794, "rewards/kidney_reward/std": 0.7840635180473328, "rewards/length2tails_reward/mean": 0.6466629505157471, "rewards/length2tails_reward/std": 0.30107077956199646, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.918172836303711, "rewards/thermo_reward/std": 1.7599866390228271, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.10589804220944643, "epoch": 1.794, "frac_reward_zero_std": 0.0, "grad_norm": 0.10630529373884201, "learning_rate": 1.4670796699556143e-06, "loss": -0.0026, "num_tokens": 7807557.0, "reward": 11.9832124710083, "reward_std": 5.638438701629639, "rewards/fitness_reward/mean": 6.682827472686768, "rewards/fitness_reward/std": 2.6741294860839844, "rewards/kidney_reward/mean": 2.3010666370391846, "rewards/kidney_reward/std": 1.1823711395263672, "rewards/length2tails_reward/mean": 0.7523695230484009, "rewards/length2tails_reward/std": 0.27255868911743164, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8240814208984375, "rewards/thermo_reward/std": 2.0244994163513184, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0914301173761487, "epoch": 1.796, "frac_reward_zero_std": 0.0, "grad_norm": 0.07685575634241104, "learning_rate": 1.4659454723558246e-06, "loss": -0.0029, "num_tokens": 7816305.0, "reward": 13.886134147644043, "reward_std": 0.4406987428665161, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7654820680618286, "rewards/length2tails_reward/std": 0.29283666610717773, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.4375, "completions/mean_terminated_length": 273.4375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09987590741366148, "epoch": 1.798, "frac_reward_zero_std": 0.0, "grad_norm": 0.0851331502199173, "learning_rate": 1.4648105086257807e-06, "loss": -0.0058, "num_tokens": 7825087.0, "reward": 13.067283630371094, "reward_std": 4.014723777770996, "rewards/fitness_reward/mean": 7.032796859741211, "rewards/fitness_reward/std": 1.8576456308364868, "rewards/kidney_reward/mean": 2.4413154125213623, "rewards/kidney_reward/std": 0.9010650515556335, "rewards/length2tails_reward/mean": 0.8494006395339966, "rewards/length2tails_reward/std": 0.22823107242584229, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4082322120666504, "rewards/thermo_reward/std": 1.3034515380859375, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09739398490637541, "epoch": 1.8, "frac_reward_zero_std": 0.0, "grad_norm": 0.13894827663898468, "learning_rate": 1.4636747806316444e-06, "loss": -0.0016, "num_tokens": 7833838.0, "reward": 13.313909530639648, "reward_std": 1.5198371410369873, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7758152484893799, "rewards/length2tails_reward/std": 0.29189664125442505, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.230740547180176, "rewards/thermo_reward/std": 1.29622483253479, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.09375, "completions/mean_terminated_length": 272.09375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09956398233771324, "epoch": 1.802, "frac_reward_zero_std": 0.0, "grad_norm": 0.12740930914878845, "learning_rate": 1.4625382902408354e-06, "loss": 0.0056, "num_tokens": 7842577.0, "reward": 13.187545776367188, "reward_std": 3.881068229675293, "rewards/fitness_reward/mean": 7.049934387207031, "rewards/fitness_reward/std": 1.7607014179229736, "rewards/kidney_reward/mean": 2.4841480255126953, "rewards/kidney_reward/std": 0.8051493167877197, "rewards/length2tails_reward/mean": 0.7785500884056091, "rewards/length2tails_reward/std": 0.22710512578487396, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.475609302520752, "rewards/thermo_reward/std": 1.3542872667312622, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.1875, "completions/mean_terminated_length": 273.1875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09471298195421696, "epoch": 1.804, "frac_reward_zero_std": 0.0, "grad_norm": 0.12734858691692352, "learning_rate": 1.4614010393220262e-06, "loss": -0.0005, "num_tokens": 7851351.0, "reward": 13.22269058227539, "reward_std": 1.6711552143096924, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8141377568244934, "rewards/length2tails_reward/std": 0.25066789984703064, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1658401489257812, "rewards/thermo_reward/std": 1.358097791671753, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.1875, "completions/mean_terminated_length": 271.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0919131813570857, "epoch": 1.806, "frac_reward_zero_std": 0.0, "grad_norm": 0.08095831423997879, "learning_rate": 1.4602630297451407e-06, "loss": -0.0013, "num_tokens": 7860061.0, "reward": 13.463017463684082, "reward_std": 0.956395149230957, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.6921268105506897, "rewards/length2tails_reward/std": 0.3316554129123688, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3334994316101074, "rewards/thermo_reward/std": 0.9141343832015991, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.9375, "completions/mean_terminated_length": 271.9375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09086079522967339, "epoch": 1.808, "frac_reward_zero_std": 0.0, "grad_norm": 0.35862961411476135, "learning_rate": 1.4591242633813494e-06, "loss": -0.0045, "num_tokens": 7868795.0, "reward": 13.240551948547363, "reward_std": 2.3463644981384277, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.4671764373779297, "rewards/kidney_reward/std": 0.6201311349868774, "rewards/length2tails_reward/mean": 0.7406184673309326, "rewards/length2tails_reward/std": 0.30301016569137573, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.353147029876709, "rewards/thermo_reward/std": 1.2772053480148315, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 270.1875, "completions/mean_terminated_length": 270.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09792979806661606, "epoch": 1.81, "frac_reward_zero_std": 0.0, "grad_norm": 0.08443191647529602, "learning_rate": 1.4579847421030676e-06, "loss": -0.0028, "num_tokens": 7877473.0, "reward": 13.049362182617188, "reward_std": 2.3406870365142822, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.480879306793213, "rewards/kidney_reward/std": 0.6816642880439758, "rewards/length2tails_reward/mean": 0.6595234274864197, "rewards/length2tails_reward/std": 0.30757832527160645, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0413448810577393, "rewards/thermo_reward/std": 1.79043447971344, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10100154858082533, "epoch": 1.812, "frac_reward_zero_std": 0.0, "grad_norm": 0.06824267655611038, "learning_rate": 1.4568444677839515e-06, "loss": -0.0032, "num_tokens": 7886193.0, "reward": 13.647187232971191, "reward_std": 0.9054825305938721, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7222320437431335, "rewards/length2tails_reward/std": 0.28808388113975525, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4872992038726807, "rewards/thermo_reward/std": 0.9007164835929871, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.1875, "completions/mean_terminated_length": 273.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09898331295698881, "epoch": 1.814, "frac_reward_zero_std": 0.0, "grad_norm": 0.10882727801799774, "learning_rate": 1.4557034422988958e-06, "loss": 0.0004, "num_tokens": 7894967.0, "reward": 11.802138328552246, "reward_std": 5.2616496086120605, "rewards/fitness_reward/mean": 6.572333335876465, "rewards/fitness_reward/std": 2.8820300102233887, "rewards/kidney_reward/mean": 2.1918716430664062, "rewards/kidney_reward/std": 1.2480754852294922, "rewards/length2tails_reward/mean": 0.7903756499290466, "rewards/length2tails_reward/std": 0.3119876980781555, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.858895778656006, "rewards/thermo_reward/std": 1.8779559135437012, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09829057846218348, "epoch": 1.8159999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.1091950312256813, "learning_rate": 1.4545616675240307e-06, "loss": 0.0031, "num_tokens": 7903699.0, "reward": 13.163553237915039, "reward_std": 2.364424705505371, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.49746036529541, "rewards/kidney_reward/std": 0.5906985402107239, "rewards/length2tails_reward/mean": 0.7393020987510681, "rewards/length2tails_reward/std": 0.2952393889427185, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.188486337661743, "rewards/thermo_reward/std": 1.5971673727035522, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.46875, "completions/mean_terminated_length": 270.46875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.09047531802207232, "epoch": 1.818, "frac_reward_zero_std": 0.0, "grad_norm": 0.15408550202846527, "learning_rate": 1.4534191453367172e-06, "loss": 0.0031, "num_tokens": 7912386.0, "reward": 13.424781799316406, "reward_std": 1.3201360702514648, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7214326858520508, "rewards/length2tails_reward/std": 0.29441601037979126, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.347050905227661, "rewards/thermo_reward/std": 1.1589794158935547, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10163731779903173, "epoch": 1.8199999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.3144538104534149, "learning_rate": 1.4522758776155464e-06, "loss": 0.0021, "num_tokens": 7921156.0, "reward": 13.272607803344727, "reward_std": 1.7322739362716675, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.365363597869873, "rewards/kidney_reward/std": 0.7302173376083374, "rewards/length2tails_reward/mean": 0.7840762734413147, "rewards/length2tails_reward/std": 0.2829437255859375, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.36765193939209, "rewards/thermo_reward/std": 1.1156370639801025, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.0990389846265316, "epoch": 1.822, "frac_reward_zero_std": 0.0, "grad_norm": 0.11379135400056839, "learning_rate": 1.4511318662403345e-06, "loss": -0.0074, "num_tokens": 7929884.0, "reward": 12.687387466430664, "reward_std": 3.787317991256714, "rewards/fitness_reward/mean": 6.938035011291504, "rewards/fitness_reward/std": 1.840762972831726, "rewards/kidney_reward/mean": 2.409886360168457, "rewards/kidney_reward/std": 0.755768358707428, "rewards/length2tails_reward/mean": 0.795867383480072, "rewards/length2tails_reward/std": 0.2925042510032654, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.159878730773926, "rewards/thermo_reward/std": 1.4481010437011719, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.8125, "completions/mean_terminated_length": 271.8125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09324057400226593, "epoch": 1.8239999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.07792110741138458, "learning_rate": 1.449987113092121e-06, "loss": -0.0054, "num_tokens": 7938614.0, "reward": 13.062980651855469, "reward_std": 2.5903031826019287, "rewards/fitness_reward/mean": 6.999163627624512, "rewards/fitness_reward/std": 2.047900915145874, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7137687802314758, "rewards/length2tails_reward/std": 0.3201422393321991, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3480372428894043, "rewards/thermo_reward/std": 1.0235670804977417, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.09945305716246367, "epoch": 1.826, "frac_reward_zero_std": 0.0, "grad_norm": 0.17810320854187012, "learning_rate": 1.448841620053165e-06, "loss": -0.0045, "num_tokens": 7947290.0, "reward": 12.08353328704834, "reward_std": 5.4303483963012695, "rewards/fitness_reward/mean": 6.957911491394043, "rewards/fitness_reward/std": 2.2812585830688477, "rewards/kidney_reward/mean": 2.2508323192596436, "rewards/kidney_reward/std": 1.38154137134552, "rewards/length2tails_reward/mean": 0.7138035893440247, "rewards/length2tails_reward/std": 0.3589658737182617, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.70340895652771, "rewards/thermo_reward/std": 2.210334300994873, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10191969107836485, "epoch": 1.8279999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.06049925461411476, "learning_rate": 1.4476953890069415e-06, "loss": -0.0067, "num_tokens": 7956033.0, "reward": 13.038139343261719, "reward_std": 2.406184196472168, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4171836376190186, "rewards/kidney_reward/std": 0.6189330220222473, "rewards/length2tails_reward/mean": 0.7486883401870728, "rewards/length2tails_reward/std": 0.3089596927165985, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.084902763366699, "rewards/thermo_reward/std": 1.8648375272750854, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09475038014352322, "epoch": 1.83, "frac_reward_zero_std": 0.0, "grad_norm": 0.08417715132236481, "learning_rate": 1.4465484218381398e-06, "loss": -0.0048, "num_tokens": 7964764.0, "reward": 12.575175285339355, "reward_std": 3.092132806777954, "rewards/fitness_reward/mean": 7.073638916015625, "rewards/fitness_reward/std": 0.8242490291595459, "rewards/kidney_reward/mean": 2.2160050868988037, "rewards/kidney_reward/std": 1.0591219663619995, "rewards/length2tails_reward/mean": 0.7387404441833496, "rewards/length2tails_reward/std": 0.2980632781982422, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1116576194763184, "rewards/thermo_reward/std": 1.4491232633590698, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.46875, "completions/mean_terminated_length": 273.46875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09082639962434769, "epoch": 1.8319999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.07699467241764069, "learning_rate": 1.445400720432659e-06, "loss": -0.0081, "num_tokens": 7973547.0, "reward": 12.984945297241211, "reward_std": 2.1997668743133545, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.471334457397461, "rewards/kidney_reward/std": 0.5981243252754211, "rewards/length2tails_reward/mean": 0.7991466522216797, "rewards/length2tails_reward/std": 0.30184128880500793, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9725115299224854, "rewards/thermo_reward/std": 1.765310287475586, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10166769474744797, "epoch": 1.834, "frac_reward_zero_std": 0.0, "grad_norm": 0.10199066251516342, "learning_rate": 1.444252286677606e-06, "loss": -0.0009, "num_tokens": 7982298.0, "reward": 13.361869812011719, "reward_std": 1.6339774131774902, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7698524594306946, "rewards/length2tails_reward/std": 0.2833806872367859, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2792983055114746, "rewards/thermo_reward/std": 1.4069561958312988, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0994638055562973, "epoch": 1.8359999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.09854952245950699, "learning_rate": 1.4431031224612913e-06, "loss": -0.0054, "num_tokens": 7991053.0, "reward": 13.195050239562988, "reward_std": 2.268061876296997, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.512495756149292, "rewards/kidney_reward/std": 0.6447898745536804, "rewards/length2tails_reward/mean": 0.7886925935745239, "rewards/length2tails_reward/std": 0.23495440185070038, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.200009346008301, "rewards/thermo_reward/std": 1.5066677331924438, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.84375, "completions/mean_terminated_length": 273.84375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.09607025887817144, "epoch": 1.838, "frac_reward_zero_std": 0.0, "grad_norm": 0.0761309266090393, "learning_rate": 1.4419532296732268e-06, "loss": -0.0036, "num_tokens": 7999848.0, "reward": 13.07663345336914, "reward_std": 2.1593430042266846, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.437307357788086, "rewards/kidney_reward/std": 0.6450527310371399, "rewards/length2tails_reward/mean": 0.8511402010917664, "rewards/length2tails_reward/std": 0.1919945776462555, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1505355834960938, "rewards/thermo_reward/std": 1.3073687553405762, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.84375, "completions/mean_terminated_length": 270.84375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08434112183749676, "epoch": 1.8399999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.112014539539814, "learning_rate": 1.4408026102041222e-06, "loss": -0.0042, "num_tokens": 8008547.0, "reward": 13.352241516113281, "reward_std": 1.2534050941467285, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.5977305173873901, "rewards/length2tails_reward/std": 0.40180954337120056, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.204803228378296, "rewards/thermo_reward/std": 1.2496801614761353, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0949926944449544, "epoch": 1.842, "frac_reward_zero_std": 0.0, "grad_norm": 0.07384954392910004, "learning_rate": 1.4396512659458822e-06, "loss": -0.0045, "num_tokens": 8017297.0, "reward": 13.42481517791748, "reward_std": 1.2391330003738403, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7046566009521484, "rewards/length2tails_reward/std": 0.3201577961444855, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.294044017791748, "rewards/thermo_reward/std": 1.095947027206421, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10082920826971531, "epoch": 1.8439999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.06823424249887466, "learning_rate": 1.4384991987916028e-06, "loss": -0.0062, "num_tokens": 8026069.0, "reward": 12.671717643737793, "reward_std": 4.562189102172852, "rewards/fitness_reward/mean": 6.998061180114746, "rewards/fitness_reward/std": 2.054140329360962, "rewards/kidney_reward/mean": 2.391406774520874, "rewards/kidney_reward/std": 1.0328963994979858, "rewards/length2tails_reward/mean": 0.8234930038452148, "rewards/length2tails_reward/std": 0.24498499929904938, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.099900722503662, "rewards/thermo_reward/std": 1.7860603332519531, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.40625, "completions/mean_terminated_length": 272.40625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10238801874220371, "epoch": 1.846, "frac_reward_zero_std": 0.0, "grad_norm": 0.11172787845134735, "learning_rate": 1.4373464106355695e-06, "loss": 0.0017, "num_tokens": 8034818.0, "reward": 12.963394165039062, "reward_std": 2.5045695304870605, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3441781997680664, "rewards/kidney_reward/std": 0.895361065864563, "rewards/length2tails_reward/mean": 0.7489575147628784, "rewards/length2tails_reward/std": 0.32563844323158264, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1406450271606445, "rewards/thermo_reward/std": 1.433326244354248, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09571900684386492, "epoch": 1.8479999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.12115728855133057, "learning_rate": 1.4361929033732526e-06, "loss": 0.0029, "num_tokens": 8043526.0, "reward": 13.049151420593262, "reward_std": 3.674513339996338, "rewards/fitness_reward/mean": 7.050969123840332, "rewards/fitness_reward/std": 1.7548457384109497, "rewards/kidney_reward/mean": 2.4927873611450195, "rewards/kidney_reward/std": 0.7562783360481262, "rewards/length2tails_reward/mean": 0.6843692064285278, "rewards/length2tails_reward/std": 0.3321457505226135, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3369572162628174, "rewards/thermo_reward/std": 1.3621774911880493, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10554734244942665, "epoch": 1.85, "frac_reward_zero_std": 0.0, "grad_norm": 0.09987124055624008, "learning_rate": 1.4350386789013044e-06, "loss": -0.0082, "num_tokens": 8052284.0, "reward": 12.666133880615234, "reward_std": 4.327279090881348, "rewards/fitness_reward/mean": 6.991957187652588, "rewards/fitness_reward/std": 1.7827560901641846, "rewards/kidney_reward/mean": 2.3112308979034424, "rewards/kidney_reward/std": 1.0905444622039795, "rewards/length2tails_reward/mean": 0.7796467542648315, "rewards/length2tails_reward/std": 0.2763543128967285, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.184980869293213, "rewards/thermo_reward/std": 1.8170864582061768, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09814912639558315, "epoch": 1.8519999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.12721046805381775, "learning_rate": 1.433883739117558e-06, "loss": -0.0046, "num_tokens": 8061036.0, "reward": 13.344095230102539, "reward_std": 1.9957871437072754, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.495349168777466, "rewards/kidney_reward/std": 0.6022319197654724, "rewards/length2tails_reward/mean": 0.7750345468521118, "rewards/length2tails_reward/std": 0.2605418860912323, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3675663471221924, "rewards/thermo_reward/std": 1.116063117980957, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.65625, "completions/mean_terminated_length": 273.65625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09740834962576628, "epoch": 1.854, "frac_reward_zero_std": 0.0, "grad_norm": 0.353313148021698, "learning_rate": 1.432728085921021e-06, "loss": -0.0035, "num_tokens": 8069825.0, "reward": 12.018280982971191, "reward_std": 5.810842514038086, "rewards/fitness_reward/mean": 6.850230693817139, "rewards/fitness_reward/std": 2.312100648880005, "rewards/kidney_reward/mean": 2.152690887451172, "rewards/kidney_reward/std": 1.591987133026123, "rewards/length2tails_reward/mean": 0.8441159129142761, "rewards/length2tails_reward/std": 0.19898547232151031, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8309483528137207, "rewards/thermo_reward/std": 2.15840482711792, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.90625, "completions/mean_terminated_length": 271.90625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09615112002938986, "epoch": 1.8559999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.11188706755638123, "learning_rate": 1.4315717212118751e-06, "loss": 0.0013, "num_tokens": 8078558.0, "reward": 13.501045227050781, "reward_std": 1.2505115270614624, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7311096787452698, "rewards/length2tails_reward/std": 0.2580016255378723, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3676280975341797, "rewards/thermo_reward/std": 1.1157556772232056, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.21875, "completions/mean_terminated_length": 273.21875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09848130866885185, "epoch": 1.858, "frac_reward_zero_std": 0.0, "grad_norm": 0.1136355847120285, "learning_rate": 1.4304146468914713e-06, "loss": -0.0023, "num_tokens": 8087333.0, "reward": 12.441036224365234, "reward_std": 5.264050483703613, "rewards/fitness_reward/mean": 6.976176738739014, "rewards/fitness_reward/std": 2.177935838699341, "rewards/kidney_reward/mean": 2.253908157348633, "rewards/kidney_reward/std": 1.4781852960586548, "rewards/length2tails_reward/mean": 0.8325543403625488, "rewards/length2tails_reward/std": 0.1891918182373047, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0276966094970703, "rewards/thermo_reward/std": 1.9368516206741333, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.4375, "completions/mean_terminated_length": 271.4375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08642545621842146, "epoch": 1.8599999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.050249114632606506, "learning_rate": 1.4292568648623274e-06, "loss": -0.0048, "num_tokens": 8096051.0, "reward": 13.441530227661133, "reward_std": 2.050464391708374, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4745676517486572, "rewards/kidney_reward/std": 0.7164823412895203, "rewards/length2tails_reward/mean": 0.6919739246368408, "rewards/length2tails_reward/std": 0.32226404547691345, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.436579704284668, "rewards/thermo_reward/std": 1.415424108505249, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.9375, "completions/mean_terminated_length": 272.9375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09469401743263006, "epoch": 1.862, "frac_reward_zero_std": 0.0, "grad_norm": 0.07521870732307434, "learning_rate": 1.4280983770281256e-06, "loss": -0.0055, "num_tokens": 8104817.0, "reward": 13.544778823852539, "reward_std": 1.2863622903823853, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8186159729957581, "rewards/length2tails_reward/std": 0.23112718760967255, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3752520084381104, "rewards/thermo_reward/std": 1.2737072706222534, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10019571334123611, "epoch": 1.8639999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.10404334962368011, "learning_rate": 1.4269391852937074e-06, "loss": 0.0045, "num_tokens": 8113572.0, "reward": 13.64660358428955, "reward_std": 0.6189796328544617, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7633348703384399, "rewards/length2tails_reward/std": 0.3237886428833008, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.50996470451355, "rewards/thermo_reward/std": 0.5615194439888, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.9375, "completions/mean_terminated_length": 271.9375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10216997563838959, "epoch": 1.866, "frac_reward_zero_std": 0.0, "grad_norm": 0.10921880602836609, "learning_rate": 1.4257792915650725e-06, "loss": -0.0061, "num_tokens": 8122306.0, "reward": 13.247479438781738, "reward_std": 2.2764835357666016, "rewards/fitness_reward/mean": 7.0529656410217285, "rewards/fitness_reward/std": 1.7435520887374878, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7549443244934082, "rewards/length2tails_reward/std": 0.2997691333293915, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4472579956054688, "rewards/thermo_reward/std": 0.9139991998672485, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.3125, "completions/mean_terminated_length": 271.3125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09624828863888979, "epoch": 1.8679999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.057099346071481705, "learning_rate": 1.4246186977493752e-06, "loss": -0.005, "num_tokens": 8131020.0, "reward": 13.203994750976562, "reward_std": 2.49579119682312, "rewards/fitness_reward/mean": 7.052845478057861, "rewards/fitness_reward/std": 1.7442326545715332, "rewards/kidney_reward/mean": 2.5115790367126465, "rewards/kidney_reward/std": 0.5140424966812134, "rewards/length2tails_reward/mean": 0.6949120759963989, "rewards/length2tails_reward/std": 0.3189202845096588, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.470078706741333, "rewards/thermo_reward/std": 0.5830413699150085, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08982496801763773, "epoch": 1.87, "frac_reward_zero_std": 0.0, "grad_norm": 0.06818225234746933, "learning_rate": 1.42345740575492e-06, "loss": 0.0002, "num_tokens": 8139764.0, "reward": 13.297567367553711, "reward_std": 1.665936827659607, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7706277370452881, "rewards/length2tails_reward/std": 0.2755311131477356, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1601996421813965, "rewards/thermo_reward/std": 1.5545653104782104, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09777613170444965, "epoch": 1.8719999999999999, "frac_reward_zero_std": 0.0, "grad_norm": 0.18864397704601288, "learning_rate": 1.4222954174911598e-06, "loss": -0.0027, "num_tokens": 8148534.0, "reward": 12.82504653930664, "reward_std": 2.758099317550659, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.3964080810546875, "rewards/kidney_reward/std": 0.9116268754005432, "rewards/length2tails_reward/mean": 0.8018828630447388, "rewards/length2tails_reward/std": 0.2824718952178955, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8872647285461426, "rewards/thermo_reward/std": 1.9564694166183472, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.3125, "completions/mean_terminated_length": 272.3125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09598909690976143, "epoch": 1.874, "frac_reward_zero_std": 0.0, "grad_norm": 0.09967732429504395, "learning_rate": 1.4211327348686924e-06, "loss": 0.0021, "num_tokens": 8157280.0, "reward": 13.60390853881836, "reward_std": 0.6416917443275452, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7352484464645386, "rewards/length2tails_reward/std": 0.2844075858592987, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.470078706741333, "rewards/thermo_reward/std": 0.5830413699150085, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09456842020154, "epoch": 1.876, "frac_reward_zero_std": 0.0, "grad_norm": 0.0885779857635498, "learning_rate": 1.4199693597992572e-06, "loss": -0.0056, "num_tokens": 8166048.0, "reward": 13.347528457641602, "reward_std": 1.448620319366455, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7930765151977539, "rewards/length2tails_reward/std": 0.2606636583805084, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.262633800506592, "rewards/thermo_reward/std": 1.2503079175949097, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09383744280785322, "epoch": 1.8780000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.12151187658309937, "learning_rate": 1.4188052941957324e-06, "loss": 0.0018, "num_tokens": 8174809.0, "reward": 13.168033599853516, "reward_std": 2.0894129276275635, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.478447914123535, "rewards/kidney_reward/std": 0.6950652003288269, "rewards/length2tails_reward/mean": 0.8190457820892334, "rewards/length2tails_reward/std": 0.23952603340148926, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.146496534347534, "rewards/thermo_reward/std": 1.5423566102981567, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 271.59375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09733179584145546, "epoch": 1.88, "frac_reward_zero_std": 0.0, "grad_norm": 0.1220579445362091, "learning_rate": 1.417640539972131e-06, "loss": -0.0002, "num_tokens": 8183532.0, "reward": 13.365985870361328, "reward_std": 1.429917812347412, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7476000785827637, "rewards/length2tails_reward/std": 0.23121827840805054, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.230919599533081, "rewards/thermo_reward/std": 1.3122392892837524, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.15625, "completions/mean_terminated_length": 273.15625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09898821823298931, "epoch": 1.8820000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.1083887368440628, "learning_rate": 1.416475099043599e-06, "loss": -0.0014, "num_tokens": 8192305.0, "reward": 13.080820083618164, "reward_std": 2.0238304138183594, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.504392147064209, "rewards/kidney_reward/std": 0.5529485940933228, "rewards/length2tails_reward/mean": 0.7721121311187744, "rewards/length2tails_reward/std": 0.30431506037712097, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0380313396453857, "rewards/thermo_reward/std": 1.6276273727416992, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08173881471157074, "epoch": 1.884, "frac_reward_zero_std": 0.0, "grad_norm": 0.07843513041734695, "learning_rate": 1.4153089733264114e-06, "loss": -0.0037, "num_tokens": 8201037.0, "reward": 13.109209060668945, "reward_std": 2.7859926223754883, "rewards/fitness_reward/mean": 6.999617576599121, "rewards/fitness_reward/std": 2.045334815979004, "rewards/kidney_reward/mean": 2.511237621307373, "rewards/kidney_reward/std": 0.5158844590187073, "rewards/length2tails_reward/mean": 0.6816088557243347, "rewards/length2tails_reward/std": 0.3428049087524414, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4301929473876953, "rewards/thermo_reward/std": 0.6010707020759583, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.65625, "completions/mean_terminated_length": 271.65625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09550878871232271, "epoch": 1.8860000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.12618957459926605, "learning_rate": 1.4141421647379681e-06, "loss": 0.0043, "num_tokens": 8209762.0, "reward": 13.667505264282227, "reward_std": 0.5561919808387756, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.6987535953521729, "rewards/length2tails_reward/std": 0.3205377459526062, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.50996470451355, "rewards/thermo_reward/std": 0.5615194439888, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.1875, "completions/mean_terminated_length": 273.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09802985563874245, "epoch": 1.888, "frac_reward_zero_std": 0.0, "grad_norm": 0.11916353553533554, "learning_rate": 1.4129746751967933e-06, "loss": 0.0004, "num_tokens": 8218536.0, "reward": 13.113667488098145, "reward_std": 2.162151336669922, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.3640737533569336, "rewards/kidney_reward/std": 0.8366342782974243, "rewards/length2tails_reward/mean": 0.7499675154685974, "rewards/length2tails_reward/std": 0.3374538719654083, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2134127616882324, "rewards/thermo_reward/std": 1.3592503070831299, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10340733919292688, "epoch": 1.8900000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.08188685774803162, "learning_rate": 1.4118065066225301e-06, "loss": -0.0066, "num_tokens": 8227284.0, "reward": 13.1350736618042, "reward_std": 2.407517910003662, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.47782564163208, "rewards/kidney_reward/std": 0.6984977126121521, "rewards/length2tails_reward/mean": 0.7967540621757507, "rewards/length2tails_reward/std": 0.2664756178855896, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1738967895507812, "rewards/thermo_reward/std": 1.5063883066177368, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.75, "completions/mean_terminated_length": 273.75, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0962830288335681, "epoch": 1.892, "frac_reward_zero_std": 0.0, "grad_norm": 0.08123506605625153, "learning_rate": 1.410637660935938e-06, "loss": -0.001, "num_tokens": 8236076.0, "reward": 13.664963722229004, "reward_std": 1.052045464515686, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8287937641143799, "rewards/length2tails_reward/std": 0.20952561497688293, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.494419574737549, "rewards/thermo_reward/std": 1.0522606372833252, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.34375, "completions/mean_terminated_length": 273.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10053523350507021, "epoch": 1.8940000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.15598320960998535, "learning_rate": 1.4094681400588907e-06, "loss": 0.0008, "num_tokens": 8244855.0, "reward": 13.835688591003418, "reward_std": 0.4316123425960541, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7851564884185791, "rewards/length2tails_reward/std": 0.3080328702926636, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.90625, "completions/mean_terminated_length": 271.90625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10091596748679876, "epoch": 1.896, "frac_reward_zero_std": 0.0, "grad_norm": 0.13453346490859985, "learning_rate": 1.4082979459143704e-06, "loss": 0.0008, "num_tokens": 8253588.0, "reward": 12.257545471191406, "reward_std": 5.135241508483887, "rewards/fitness_reward/mean": 6.992251396179199, "rewards/fitness_reward/std": 2.087003707885742, "rewards/kidney_reward/mean": 2.2595787048339844, "rewards/kidney_reward/std": 1.3908474445343018, "rewards/length2tails_reward/mean": 0.7675119042396545, "rewards/length2tails_reward/std": 0.26772943139076233, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8289639949798584, "rewards/thermo_reward/std": 2.066842555999756, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09524937998503447, "epoch": 1.8980000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.07262866199016571, "learning_rate": 1.407127080426468e-06, "loss": -0.0065, "num_tokens": 8262324.0, "reward": 12.8803071975708, "reward_std": 2.1984105110168457, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5063095092773438, "rewards/kidney_reward/std": 0.5425441265106201, "rewards/length2tails_reward/mean": 0.7491485476493835, "rewards/length2tails_reward/std": 0.25587987899780273, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.895406723022461, "rewards/thermo_reward/std": 1.6321258544921875, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0994444964453578, "epoch": 1.9, "frac_reward_zero_std": 0.0, "grad_norm": 0.0991891473531723, "learning_rate": 1.4059555455203776e-06, "loss": 0.0012, "num_tokens": 8271084.0, "reward": 12.7920560836792, "reward_std": 2.592541456222534, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4269556999206543, "rewards/kidney_reward/std": 0.6990845203399658, "rewards/length2tails_reward/mean": 0.7857171893119812, "rewards/length2tails_reward/std": 0.24670259654521942, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8828530311584473, "rewards/thermo_reward/std": 1.8071008920669556, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09295944962650537, "epoch": 1.9020000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.07763893902301788, "learning_rate": 1.4047833431223936e-06, "loss": -0.007, "num_tokens": 8279845.0, "reward": 12.431122779846191, "reward_std": 4.762660026550293, "rewards/fitness_reward/mean": 6.984732151031494, "rewards/fitness_reward/std": 2.1295392513275146, "rewards/kidney_reward/mean": 2.347259521484375, "rewards/kidney_reward/std": 1.1302677392959595, "rewards/length2tails_reward/mean": 0.7435581684112549, "rewards/length2tails_reward/std": 0.3555532395839691, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9247758388519287, "rewards/thermo_reward/std": 1.875200629234314, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.0887742293998599, "epoch": 1.904, "frac_reward_zero_std": 0.0, "grad_norm": 0.08556952327489853, "learning_rate": 1.403610475159909e-06, "loss": -0.0001, "num_tokens": 8288625.0, "reward": 13.757013320922852, "reward_std": 0.510677695274353, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7961187362670898, "rewards/length2tails_reward/std": 0.2826400697231293, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09668416064232588, "epoch": 1.9060000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.0863301232457161, "learning_rate": 1.40243694356141e-06, "loss": -0.0057, "num_tokens": 8297376.0, "reward": 13.427196502685547, "reward_std": 1.445143461227417, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5376205444335938, "rewards/kidney_reward/std": 0.5026634931564331, "rewards/length2tails_reward/mean": 0.733130156993866, "rewards/length2tails_reward/std": 0.3547823131084442, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3550782203674316, "rewards/thermo_reward/std": 0.9904274940490723, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.65625, "completions/mean_terminated_length": 272.65625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09867602214217186, "epoch": 1.908, "frac_reward_zero_std": 0.0, "grad_norm": 0.07859351485967636, "learning_rate": 1.4012627502564742e-06, "loss": -0.0033, "num_tokens": 8306133.0, "reward": 13.1310453414917, "reward_std": 1.9072624444961548, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.509688138961792, "rewards/kidney_reward/std": 0.5242533683776855, "rewards/length2tails_reward/mean": 0.8021127581596375, "rewards/length2tails_reward/std": 0.24368120729923248, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0799612998962402, "rewards/thermo_reward/std": 1.5795716047286987, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10116610862314701, "epoch": 1.9100000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.05824463441967964, "learning_rate": 1.400087897175768e-06, "loss": -0.0036, "num_tokens": 8314854.0, "reward": 13.070198059082031, "reward_std": 3.001626491546631, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.511730670928955, "rewards/kidney_reward/std": 0.5132253766059875, "rewards/length2tails_reward/mean": 0.6872705817222595, "rewards/length2tails_reward/std": 0.3044886887073517, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.336686611175537, "rewards/thermo_reward/std": 0.9002618789672852, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08747619763016701, "epoch": 1.912, "frac_reward_zero_std": 0.0, "grad_norm": 0.09268809109926224, "learning_rate": 1.3989123862510416e-06, "loss": -0.0013, "num_tokens": 8323592.0, "reward": 12.777021408081055, "reward_std": 3.3635833263397217, "rewards/fitness_reward/mean": 6.941766738891602, "rewards/fitness_reward/std": 2.06256365776062, "rewards/kidney_reward/mean": 2.44405460357666, "rewards/kidney_reward/std": 0.7441923022270203, "rewards/length2tails_reward/mean": 0.7442810535430908, "rewards/length2tails_reward/std": 0.29096719622612, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.21677303314209, "rewards/thermo_reward/std": 1.5317223072052002, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10323672275990248, "epoch": 1.9140000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.09083746373653412, "learning_rate": 1.3977362194151278e-06, "loss": 0.0029, "num_tokens": 8332335.0, "reward": 13.35675048828125, "reward_std": 1.421925663948059, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7612141370773315, "rewards/length2tails_reward/std": 0.26568615436553955, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2750422954559326, "rewards/thermo_reward/std": 1.289695382118225, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.71875, "completions/mean_terminated_length": 272.71875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.10143384896218777, "epoch": 1.916, "frac_reward_zero_std": 0.0, "grad_norm": 0.07080568373203278, "learning_rate": 1.3965593986019372e-06, "loss": -0.0015, "num_tokens": 8341094.0, "reward": 13.076738357543945, "reward_std": 2.0742430686950684, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7840505838394165, "rewards/length2tails_reward/std": 0.2203812152147293, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9927470684051514, "rewards/thermo_reward/std": 1.8821724653244019, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09049487207084894, "epoch": 1.9180000000000001, "frac_reward_zero_std": 0.0, "grad_norm": 0.08178161829710007, "learning_rate": 1.3953819257464558e-06, "loss": -0.0024, "num_tokens": 8349874.0, "reward": 12.805414199829102, "reward_std": 2.533339738845825, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4211130142211914, "rewards/kidney_reward/std": 0.5994554758071899, "rewards/length2tails_reward/mean": 0.7625944018363953, "rewards/length2tails_reward/std": 0.31074151396751404, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9043655395507812, "rewards/thermo_reward/std": 1.8625069856643677, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.8125, "completions/mean_terminated_length": 273.8125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09283266495913267, "epoch": 1.92, "frac_reward_zero_std": 0.0, "grad_norm": 0.06749741733074188, "learning_rate": 1.3942038027847423e-06, "loss": -0.005, "num_tokens": 8358668.0, "reward": 12.34156322479248, "reward_std": 5.33221435546875, "rewards/fitness_reward/mean": 6.687288284301758, "rewards/fitness_reward/std": 2.662079334259033, "rewards/kidney_reward/mean": 2.3554515838623047, "rewards/kidney_reward/std": 1.135952353477478, "rewards/length2tails_reward/mean": 0.797242283821106, "rewards/length2tails_reward/std": 0.2937704026699066, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1190998554229736, "rewards/thermo_reward/std": 1.6363537311553955, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09066375996917486, "epoch": 1.9220000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.12556754052639008, "learning_rate": 1.3930250316539235e-06, "loss": -0.0062, "num_tokens": 8367389.0, "reward": 12.197015762329102, "reward_std": 4.578823566436768, "rewards/fitness_reward/mean": 6.927325248718262, "rewards/fitness_reward/std": 1.897562026977539, "rewards/kidney_reward/mean": 2.257803201675415, "rewards/kidney_reward/std": 1.0970770120620728, "rewards/length2tails_reward/mean": 0.6851399540901184, "rewards/length2tails_reward/std": 0.34955552220344543, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8433737754821777, "rewards/thermo_reward/std": 2.145561695098877, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09863616060465574, "epoch": 1.924, "frac_reward_zero_std": 0.0, "grad_norm": 0.10057486593723297, "learning_rate": 1.3918456142921925e-06, "loss": 0.0015, "num_tokens": 8376154.0, "reward": 13.623000144958496, "reward_std": 0.683121919631958, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8008888959884644, "rewards/length2tails_reward/std": 0.2569819390773773, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.50996470451355, "rewards/thermo_reward/std": 0.5615194439888, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10349648538976908, "epoch": 1.9260000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.10809936374425888, "learning_rate": 1.390665552638805e-06, "loss": -0.0031, "num_tokens": 8384922.0, "reward": 13.108209609985352, "reward_std": 2.479018211364746, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.446418523788452, "rewards/kidney_reward/std": 0.8726393580436707, "rewards/length2tails_reward/mean": 0.8275328874588013, "rewards/length2tails_reward/std": 0.23122048377990723, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1178531646728516, "rewards/thermo_reward/std": 1.775908350944519, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.8125, "completions/mean_terminated_length": 270.8125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.09821292012929916, "epoch": 1.928, "frac_reward_zero_std": 0.0, "grad_norm": 0.2729121446609497, "learning_rate": 1.3894848486340754e-06, "loss": 0.0046, "num_tokens": 8393620.0, "reward": 13.802491188049316, "reward_std": 0.5193327069282532, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7267663478851318, "rewards/length2tails_reward/std": 0.33441072702407837, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09538561757653952, "epoch": 1.9300000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.08171401172876358, "learning_rate": 1.388303504219375e-06, "loss": -0.0053, "num_tokens": 8402374.0, "reward": 13.652557373046875, "reward_std": 0.9250484108924866, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7945376634597778, "rewards/length2tails_reward/std": 0.27631810307502747, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4854390621185303, "rewards/thermo_reward/std": 0.9097126126289368, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10812081210315228, "epoch": 1.932, "frac_reward_zero_std": 0.0, "grad_norm": 0.11137594282627106, "learning_rate": 1.387121521337128e-06, "loss": 0.0014, "num_tokens": 8411129.0, "reward": 13.556892395019531, "reward_std": 0.6138424277305603, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.789208710193634, "rewards/length2tails_reward/std": 0.28543025255203247, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3903069496154785, "rewards/thermo_reward/std": 0.6159141063690186, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09557634359225631, "epoch": 1.9340000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.05619344115257263, "learning_rate": 1.3859389019308082e-06, "loss": -0.0061, "num_tokens": 8419857.0, "reward": 13.010406494140625, "reward_std": 3.1404671669006348, "rewards/fitness_reward/mean": 6.938035011291504, "rewards/fitness_reward/std": 1.840762972831726, "rewards/kidney_reward/mean": 2.4843714237213135, "rewards/kidney_reward/std": 0.5299732685089111, "rewards/length2tails_reward/mean": 0.7154085636138916, "rewards/length2tails_reward/std": 0.319457471370697, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4164581298828125, "rewards/thermo_reward/std": 0.8831133842468262, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09051778819411993, "epoch": 1.936, "frac_reward_zero_std": 0.0, "grad_norm": 0.13804982602596283, "learning_rate": 1.384755647944936e-06, "loss": -0.005, "num_tokens": 8428607.0, "reward": 12.845332145690918, "reward_std": 3.591193199157715, "rewards/fitness_reward/mean": 7.020461559295654, "rewards/fitness_reward/std": 1.9274237155914307, "rewards/kidney_reward/mean": 2.483234405517578, "rewards/kidney_reward/std": 0.5358594655990601, "rewards/length2tails_reward/mean": 0.7685626745223999, "rewards/length2tails_reward/std": 0.31856513023376465, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1647799015045166, "rewards/thermo_reward/std": 1.5537383556365967, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.5, "completions/mean_terminated_length": 273.5, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.10311949905008078, "epoch": 1.938, "frac_reward_zero_std": 0.0, "grad_norm": 0.1776103377342224, "learning_rate": 1.3835717613250753e-06, "loss": -0.0019, "num_tokens": 8437391.0, "reward": 13.27910041809082, "reward_std": 2.4083566665649414, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.422438621520996, "rewards/kidney_reward/std": 0.8103145360946655, "rewards/length2tails_reward/mean": 0.8479813933372498, "rewards/length2tails_reward/std": 0.2027747631072998, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3681869506835938, "rewards/thermo_reward/std": 1.3651937246322632, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.3125, "completions/mean_terminated_length": 272.3125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09956981893628836, "epoch": 1.94, "frac_reward_zero_std": 0.0, "grad_norm": 0.08768046647310257, "learning_rate": 1.38238724401783e-06, "loss": -0.0042, "num_tokens": 8446137.0, "reward": 12.956103324890137, "reward_std": 3.2299368381500244, "rewards/fitness_reward/mean": 6.987481117248535, "rewards/fitness_reward/std": 2.113990306854248, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7534193396568298, "rewards/length2tails_reward/std": 0.2735549807548523, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2215192317962646, "rewards/thermo_reward/std": 1.3512930870056152, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.03125, "completions/mean_terminated_length": 273.03125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10589835606515408, "epoch": 1.942, "frac_reward_zero_std": 0.0, "grad_norm": 0.10970190912485123, "learning_rate": 1.3812020979708417e-06, "loss": -0.0011, "num_tokens": 8454906.0, "reward": 12.781929969787598, "reward_std": 2.1080613136291504, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4233293533325195, "rewards/kidney_reward/std": 0.5885559916496277, "rewards/length2tails_reward/mean": 0.7808050513267517, "rewards/length2tails_reward/std": 0.2904893755912781, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8193349838256836, "rewards/thermo_reward/std": 1.6535685062408447, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.1875, "completions/mean_terminated_length": 272.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09761006757616997, "epoch": 1.944, "frac_reward_zero_std": 0.0, "grad_norm": 0.1761743128299713, "learning_rate": 1.3800163251327849e-06, "loss": -0.0004, "num_tokens": 8463648.0, "reward": 12.051850318908691, "reward_std": 5.621099472045898, "rewards/fitness_reward/mean": 6.686112403869629, "rewards/fitness_reward/std": 2.667112112045288, "rewards/kidney_reward/mean": 2.3069543838500977, "rewards/kidney_reward/std": 1.2476999759674072, "rewards/length2tails_reward/mean": 0.7506591081619263, "rewards/length2tails_reward/std": 0.3087124824523926, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8837180137634277, "rewards/thermo_reward/std": 2.179638147354126, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.09852071199566126, "epoch": 1.946, "frac_reward_zero_std": 0.0, "grad_norm": 0.07111741602420807, "learning_rate": 1.3788299274533647e-06, "loss": -0.001, "num_tokens": 8472420.0, "reward": 13.54722785949707, "reward_std": 0.9824661612510681, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8295840620994568, "rewards/length2tails_reward/std": 0.19819991290569305, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4039645195007324, "rewards/thermo_reward/std": 0.9403955340385437, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.09349171537905931, "epoch": 1.948, "frac_reward_zero_std": 0.0, "grad_norm": 0.46533671021461487, "learning_rate": 1.377642906883315e-06, "loss": -0.013, "num_tokens": 8481116.0, "reward": 13.117490768432617, "reward_std": 1.789286494255066, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.717496931552887, "rewards/length2tails_reward/std": 0.3229467272758484, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.100454807281494, "rewards/thermo_reward/std": 1.4970639944076538, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09781383909285069, "epoch": 1.95, "frac_reward_zero_std": 0.0, "grad_norm": 0.05911961942911148, "learning_rate": 1.3764552653743919e-06, "loss": -0.0044, "num_tokens": 8489871.0, "reward": 13.45779800415039, "reward_std": 1.3013496398925781, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7833488583564758, "rewards/length2tails_reward/std": 0.2513625919818878, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.346517562866211, "rewards/thermo_reward/std": 1.1603977680206299, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.1875, "completions/mean_terminated_length": 272.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09864049591124058, "epoch": 1.952, "frac_reward_zero_std": 0.0, "grad_norm": 0.07618109881877899, "learning_rate": 1.3752670048793743e-06, "loss": -0.0041, "num_tokens": 8498613.0, "reward": 12.933008193969727, "reward_std": 2.295651435852051, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.517043113708496, "rewards/kidney_reward/std": 0.2941751182079315, "rewards/length2tails_reward/mean": 0.7475055456161499, "rewards/length2tails_reward/std": 0.30367472767829895, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8800292015075684, "rewards/thermo_reward/std": 1.998351812362671, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10363372974097729, "epoch": 1.954, "frac_reward_zero_std": 0.0, "grad_norm": 0.07959149032831192, "learning_rate": 1.3740781273520572e-06, "loss": -0.006, "num_tokens": 8507374.0, "reward": 12.797253608703613, "reward_std": 2.882831335067749, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3329405784606934, "rewards/kidney_reward/std": 0.9683598279953003, "rewards/length2tails_reward/mean": 0.7689581513404846, "rewards/length2tails_reward/std": 0.27883997559547424, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.983741283416748, "rewards/thermo_reward/std": 1.7046470642089844, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09072333294898272, "epoch": 1.956, "frac_reward_zero_std": 0.0, "grad_norm": 0.09165831655263901, "learning_rate": 1.3728886347472515e-06, "loss": 0.006, "num_tokens": 8516110.0, "reward": 13.664487838745117, "reward_std": 0.9979863166809082, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7295340895652771, "rewards/length2tails_reward/std": 0.2998608946800232, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.531228542327881, "rewards/thermo_reward/std": 0.8661776781082153, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09883951116353273, "epoch": 1.958, "frac_reward_zero_std": 0.0, "grad_norm": 0.09821531176567078, "learning_rate": 1.3716985290207786e-06, "loss": -0.0055, "num_tokens": 8524818.0, "reward": 13.384990692138672, "reward_std": 1.777211308479309, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7037378549575806, "rewards/length2tails_reward/std": 0.26936420798301697, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2816712856292725, "rewards/thermo_reward/std": 1.5554665327072144, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09577017091214657, "epoch": 1.96, "frac_reward_zero_std": 0.0, "grad_norm": 0.10303295403718948, "learning_rate": 1.3705078121294688e-06, "loss": -0.0015, "num_tokens": 8533576.0, "reward": 13.507298469543457, "reward_std": 1.4854562282562256, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8118197917938232, "rewards/length2tails_reward/std": 0.20147258043289185, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3931705951690674, "rewards/thermo_reward/std": 1.2787867784500122, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10504948906600475, "epoch": 1.962, "frac_reward_zero_std": 0.0, "grad_norm": 0.12444677203893661, "learning_rate": 1.3693164860311562e-06, "loss": 0.0027, "num_tokens": 8542313.0, "reward": 13.287333488464355, "reward_std": 1.4708105325698853, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7800248861312866, "rewards/length2tails_reward/std": 0.22505053877830505, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.176384925842285, "rewards/thermo_reward/std": 1.3494049310684204, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.11124454904347658, "epoch": 1.964, "frac_reward_zero_std": 0.0, "grad_norm": 0.10324909538030624, "learning_rate": 1.3681245526846781e-06, "loss": -0.0022, "num_tokens": 8551081.0, "reward": 13.270578384399414, "reward_std": 1.4656739234924316, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.517043113708496, "rewards/kidney_reward/std": 0.2941751182079315, "rewards/length2tails_reward/mean": 0.800129771232605, "rewards/length2tails_reward/std": 0.2642292380332947, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.212337017059326, "rewards/thermo_reward/std": 1.2223610877990723, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.1875, "completions/mean_terminated_length": 273.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09352367091923952, "epoch": 1.966, "frac_reward_zero_std": 0.0, "grad_norm": 0.0937211737036705, "learning_rate": 1.3669320140498683e-06, "loss": -0.0034, "num_tokens": 8559855.0, "reward": 13.362506866455078, "reward_std": 1.3659077882766724, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7871782779693604, "rewards/length2tails_reward/std": 0.2870863676071167, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.250842571258545, "rewards/thermo_reward/std": 1.3097805976867676, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.15625, "completions/mean_terminated_length": 273.15625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09805367235094309, "epoch": 1.968, "frac_reward_zero_std": 0.0, "grad_norm": 0.07226665318012238, "learning_rate": 1.3657388720875579e-06, "loss": -0.0034, "num_tokens": 8568628.0, "reward": 13.506750106811523, "reward_std": 1.1540277004241943, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7862600088119507, "rewards/length2tails_reward/std": 0.2653719186782837, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3951773643493652, "rewards/thermo_reward/std": 0.9817285537719727, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 274.25, "completions/mean_terminated_length": 274.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10598085541278124, "epoch": 1.97, "frac_reward_zero_std": 0.0, "grad_norm": 0.07709791511297226, "learning_rate": 1.3645451287595686e-06, "loss": -0.0035, "num_tokens": 8577436.0, "reward": 13.531421661376953, "reward_std": 1.180441975593567, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8664608597755432, "rewards/length2tails_reward/std": 0.21417482197284698, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3571105003356934, "rewards/thermo_reward/std": 1.1683533191680908, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 274.03125, "completions/mean_terminated_length": 274.03125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0898081511259079, "epoch": 1.972, "frac_reward_zero_std": 0.0, "grad_norm": 0.07521162182092667, "learning_rate": 1.3633507860287114e-06, "loss": -0.0047, "num_tokens": 8586237.0, "reward": 13.374664306640625, "reward_std": 1.6267354488372803, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8432106971740723, "rewards/length2tails_reward/std": 0.21972203254699707, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.257397413253784, "rewards/thermo_reward/std": 1.4419727325439453, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09460345283150673, "epoch": 1.974, "frac_reward_zero_std": 0.0, "grad_norm": 0.10553532838821411, "learning_rate": 1.3621558458587833e-06, "loss": -0.0049, "num_tokens": 8594977.0, "reward": 12.886335372924805, "reward_std": 4.894617557525635, "rewards/fitness_reward/mean": 6.972713470458984, "rewards/fitness_reward/std": 2.197526216506958, "rewards/kidney_reward/mean": 2.406907796859741, "rewards/kidney_reward/std": 1.242085576057434, "rewards/length2tails_reward/mean": 0.7495771050453186, "rewards/length2tails_reward/std": 0.2758795917034149, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.331756591796875, "rewards/thermo_reward/std": 1.5028119087219238, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.71875, "completions/mean_terminated_length": 271.71875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09682532027363777, "epoch": 1.976, "frac_reward_zero_std": 0.0, "grad_norm": 0.13430586457252502, "learning_rate": 1.3609603102145623e-06, "loss": -0.0037, "num_tokens": 8603704.0, "reward": 12.990877151489258, "reward_std": 3.0743181705474854, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.5390896797180176, "rewards/kidney_reward/std": 0.49435171484947205, "rewards/length2tails_reward/mean": 0.7251232862472534, "rewards/length2tails_reward/std": 0.30888208746910095, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2262210845947266, "rewards/thermo_reward/std": 1.1746268272399902, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.21875, "completions/mean_terminated_length": 273.21875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10247461404651403, "epoch": 1.978, "frac_reward_zero_std": 0.0, "grad_norm": 0.1997416615486145, "learning_rate": 1.359764181061807e-06, "loss": -0.0028, "num_tokens": 8612479.0, "reward": 12.312665939331055, "reward_std": 3.448193311691284, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.452592670917511, "rewards/kidney_reward/mean": 2.255847454071045, "rewards/kidney_reward/std": 1.216636300086975, "rewards/length2tails_reward/mean": 0.788469672203064, "rewards/length2tails_reward/std": 0.3180660903453827, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.631805419921875, "rewards/thermo_reward/std": 2.0575854778289795, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09506144933402538, "epoch": 1.98, "frac_reward_zero_std": 0.0, "grad_norm": 0.11303427815437317, "learning_rate": 1.3585674603672507e-06, "loss": 0.0027, "num_tokens": 8621219.0, "reward": 13.591766357421875, "reward_std": 1.0942988395690918, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7696424722671509, "rewards/length2tails_reward/std": 0.2485266625881195, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4818553924560547, "rewards/thermo_reward/std": 0.9271373748779297, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09461696818470955, "epoch": 1.982, "frac_reward_zero_std": 0.0, "grad_norm": 0.13793237507343292, "learning_rate": 1.357370150098601e-06, "loss": -0.0026, "num_tokens": 8629940.0, "reward": 12.687845230102539, "reward_std": 4.5070953369140625, "rewards/fitness_reward/mean": 7.020533561706543, "rewards/fitness_reward/std": 1.927014708518982, "rewards/kidney_reward/mean": 2.4172534942626953, "rewards/kidney_reward/std": 1.0354214906692505, "rewards/length2tails_reward/mean": 0.7397796511650085, "rewards/length2tails_reward/std": 0.26660993695259094, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.076079845428467, "rewards/thermo_reward/std": 2.0029871463775635, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "entropy": 0.10359423235058784, "epoch": 1.984, "frac_reward_zero_std": 0.0, "grad_norm": 0.1719888299703598, "learning_rate": 1.3561722522245325e-06, "loss": 0.0036, "num_tokens": 8638684.0, "reward": 13.72146987915039, "reward_std": 0.5342023968696594, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8395423889160156, "rewards/length2tails_reward/std": 0.17699043452739716, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498507022857666, "rewards/thermo_reward/std": 0.5360844731330872, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.1875, "completions/mean_terminated_length": 273.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08843657933175564, "epoch": 1.986, "frac_reward_zero_std": 0.0, "grad_norm": 0.16318239271640778, "learning_rate": 1.3549737687146882e-06, "loss": 0.0034, "num_tokens": 8647458.0, "reward": 13.68990707397461, "reward_std": 0.6028597950935364, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7975119352340698, "rewards/length2tails_reward/std": 0.2890626788139343, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498507022857666, "rewards/thermo_reward/std": 0.5360844731330872, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "entropy": 0.10393639095127583, "epoch": 1.988, "frac_reward_zero_std": 0.0, "grad_norm": 0.06861487030982971, "learning_rate": 1.3537747015396723e-06, "loss": 0.0001, "num_tokens": 8656154.0, "reward": 12.781076431274414, "reward_std": 3.3552956581115723, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.511730670928955, "rewards/kidney_reward/std": 0.5132253766059875, "rewards/length2tails_reward/mean": 0.7422202825546265, "rewards/length2tails_reward/std": 0.2663733661174774, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0420706272125244, "rewards/thermo_reward/std": 1.7657049894332886, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09462670609354973, "epoch": 1.99, "frac_reward_zero_std": 0.0, "grad_norm": 0.13636404275894165, "learning_rate": 1.3525750526710499e-06, "loss": -0.001, "num_tokens": 8664910.0, "reward": 12.781831741333008, "reward_std": 4.220463275909424, "rewards/fitness_reward/mean": 7.030411243438721, "rewards/fitness_reward/std": 1.871139645576477, "rewards/kidney_reward/mean": 2.3933980464935303, "rewards/kidney_reward/std": 1.021881341934204, "rewards/length2tails_reward/mean": 0.7944914102554321, "rewards/length2tails_reward/std": 0.25824275612831116, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1785738468170166, "rewards/thermo_reward/std": 1.5321305990219116, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 284.4375, "completions/mean_terminated_length": 284.4375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.1007428178563714, "epoch": 1.992, "frac_reward_zero_std": 0.0, "grad_norm": 0.5512227416038513, "learning_rate": 1.3513748240813427e-06, "loss": -0.0161, "num_tokens": 8674044.0, "reward": 13.84260368347168, "reward_std": 0.4883881211280823, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7290430068969727, "rewards/length2tails_reward/std": 0.30068859457969666, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09722385462373495, "epoch": 1.994, "frac_reward_zero_std": 0.0, "grad_norm": 0.07077227532863617, "learning_rate": 1.350174017744024e-06, "loss": -0.0004, "num_tokens": 8682772.0, "reward": 12.302651405334473, "reward_std": 5.062031269073486, "rewards/fitness_reward/mean": 7.00186824798584, "rewards/fitness_reward/std": 2.0326035022735596, "rewards/kidney_reward/mean": 2.200948476791382, "rewards/kidney_reward/std": 1.4013389348983765, "rewards/length2tails_reward/mean": 0.760050356388092, "rewards/length2tails_reward/std": 0.2809792160987854, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9238295555114746, "rewards/thermo_reward/std": 2.1210215091705322, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09150166623294353, "epoch": 1.996, "frac_reward_zero_std": 0.0, "grad_norm": 0.07145685702562332, "learning_rate": 1.3489726356335189e-06, "loss": 0.0029, "num_tokens": 8691530.0, "reward": 13.876331329345703, "reward_std": 0.37368762493133545, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7927263975143433, "rewards/length2tails_reward/std": 0.25554361939430237, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.4375, "completions/mean_terminated_length": 270.4375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0959260631352663, "epoch": 1.998, "frac_reward_zero_std": 0.0, "grad_norm": 0.1562761813402176, "learning_rate": 1.3477706797251984e-06, "loss": -0.003, "num_tokens": 8700216.0, "reward": 12.899539947509766, "reward_std": 2.173999786376953, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.3832297325134277, "rewards/kidney_reward/std": 0.7602803111076355, "rewards/length2tails_reward/mean": 0.6400865912437439, "rewards/length2tails_reward/std": 0.33226698637008667, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9911162853240967, "rewards/thermo_reward/std": 1.7141780853271484, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.09887748025357723, "epoch": 2.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.08962114155292511, "learning_rate": 1.3465681519953763e-06, "loss": 0.0021, "num_tokens": 8708972.0, "reward": 13.58544921875, "reward_std": 0.9744566082954407, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8294492959976196, "rewards/length2tails_reward/std": 0.20147967338562012, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4421987533569336, "rewards/thermo_reward/std": 0.9380018711090088, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.03125, "completions/mean_terminated_length": 273.03125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09856892470270395, "epoch": 2.002, "frac_reward_zero_std": 0.0, "grad_norm": 0.1271350234746933, "learning_rate": 1.3453650544213076e-06, "loss": -0.0074, "num_tokens": 8717741.0, "reward": 13.097925186157227, "reward_std": 2.1939687728881836, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.4370408058166504, "rewards/kidney_reward/std": 0.6464345455169678, "rewards/length2tails_reward/mean": 0.7915678024291992, "rewards/length2tails_reward/std": 0.2875259518623352, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2355613708496094, "rewards/thermo_reward/std": 1.5196568965911865, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.5, "completions/mean_terminated_length": 273.5, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09953618701547384, "epoch": 2.004, "frac_reward_zero_std": 0.0, "grad_norm": 0.11234353482723236, "learning_rate": 1.3441613889811842e-06, "loss": 0.0046, "num_tokens": 8726525.0, "reward": 13.733948707580566, "reward_std": 0.576275110244751, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8390617370605469, "rewards/length2tails_reward/std": 0.21928992867469788, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10064785089343786, "epoch": 2.006, "frac_reward_zero_std": 0.0, "grad_norm": 0.1869335174560547, "learning_rate": 1.3429571576541314e-06, "loss": 0.0006, "num_tokens": 8735279.0, "reward": 13.310781478881836, "reward_std": 2.0318822860717773, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.448695182800293, "rewards/kidney_reward/std": 0.7191016674041748, "rewards/length2tails_reward/mean": 0.7878056764602661, "rewards/length2tails_reward/std": 0.27009204030036926, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3796305656433105, "rewards/thermo_reward/std": 1.0566301345825195, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08979780692607164, "epoch": 2.008, "frac_reward_zero_std": 0.0, "grad_norm": 0.05455811321735382, "learning_rate": 1.3417523624202052e-06, "loss": -0.0046, "num_tokens": 8744016.0, "reward": 13.631386756896973, "reward_std": 1.6224788427352905, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7611433267593384, "rewards/length2tails_reward/std": 0.2798122763633728, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.494967460632324, "rewards/thermo_reward/std": 1.4581682682037354, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.1012858347967267, "epoch": 2.01, "frac_reward_zero_std": 0.0, "grad_norm": 0.11689814925193787, "learning_rate": 1.3405470052603882e-06, "loss": -0.0019, "num_tokens": 8752777.0, "reward": 12.523555755615234, "reward_std": 4.099765300750732, "rewards/fitness_reward/mean": 7.006715774536133, "rewards/fitness_reward/std": 2.0051794052124023, "rewards/kidney_reward/mean": 2.3300538063049316, "rewards/kidney_reward/std": 0.943598747253418, "rewards/length2tails_reward/mean": 0.8091681003570557, "rewards/length2tails_reward/std": 0.2313581109046936, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0058703422546387, "rewards/thermo_reward/std": 1.6812902688980103, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.9375, "completions/mean_terminated_length": 272.9375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09508247300982475, "epoch": 2.012, "frac_reward_zero_std": 0.0, "grad_norm": 0.11960920691490173, "learning_rate": 1.3393410881565876e-06, "loss": -0.0001, "num_tokens": 8761543.0, "reward": 12.627235412597656, "reward_std": 4.367265701293945, "rewards/fitness_reward/mean": 7.018800258636475, "rewards/fitness_reward/std": 1.936821460723877, "rewards/kidney_reward/mean": 2.4195845127105713, "rewards/kidney_reward/std": 1.0223861932754517, "rewards/length2tails_reward/mean": 0.8042271137237549, "rewards/length2tails_reward/std": 0.22055809199810028, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0084283351898193, "rewards/thermo_reward/std": 1.5836304426193237, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.15625, "completions/mean_terminated_length": 271.15625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09306901833042502, "epoch": 2.014, "frac_reward_zero_std": 0.0, "grad_norm": 0.084152452647686, "learning_rate": 1.3381346130916314e-06, "loss": -0.006, "num_tokens": 8770252.0, "reward": 13.134542465209961, "reward_std": 3.140427350997925, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.511730670928955, "rewards/kidney_reward/std": 0.5132253766059875, "rewards/length2tails_reward/mean": 0.695670485496521, "rewards/length2tails_reward/std": 0.32462623715400696, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.400190591812134, "rewards/thermo_reward/std": 1.2556166648864746, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.3125, "completions/mean_terminated_length": 272.3125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09930686559528112, "epoch": 2.016, "frac_reward_zero_std": 0.0, "grad_norm": 0.10044628381729126, "learning_rate": 1.336927582049264e-06, "loss": 0.0009, "num_tokens": 8778998.0, "reward": 13.425021171569824, "reward_std": 1.7134307622909546, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7415978908538818, "rewards/length2tails_reward/std": 0.33030208945274353, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.345273494720459, "rewards/thermo_reward/std": 1.483225703239441, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08971796184778214, "epoch": 2.018, "frac_reward_zero_std": 0.0, "grad_norm": 0.06464128941297531, "learning_rate": 1.3357199970141454e-06, "loss": -0.0054, "num_tokens": 8787745.0, "reward": 13.278300285339355, "reward_std": 2.359668731689453, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4579098224639893, "rewards/kidney_reward/std": 0.808746337890625, "rewards/length2tails_reward/mean": 0.7865937948226929, "rewards/length2tails_reward/std": 0.2422349900007248, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3380556106567383, "rewards/thermo_reward/std": 1.2652010917663574, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09284854214638472, "epoch": 2.02, "frac_reward_zero_std": 0.0, "grad_norm": 0.07684195041656494, "learning_rate": 1.3345118599718454e-06, "loss": -0.0006, "num_tokens": 8796497.0, "reward": 13.54580307006836, "reward_std": 1.1847339868545532, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8016017079353333, "rewards/length2tails_reward/std": 0.1992776095867157, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3779773712158203, "rewards/thermo_reward/std": 1.1855145692825317, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09131715074181557, "epoch": 2.022, "frac_reward_zero_std": 0.0, "grad_norm": 0.11919866502285004, "learning_rate": 1.3333031729088417e-06, "loss": -0.0051, "num_tokens": 8805232.0, "reward": 13.67043399810791, "reward_std": 0.5742598176002502, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7280459403991699, "rewards/length2tails_reward/std": 0.3318087160587311, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.50996470451355, "rewards/thermo_reward/std": 0.5615194439888, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.10009663831442595, "epoch": 2.024, "frac_reward_zero_std": 0.0, "grad_norm": 0.08270273357629776, "learning_rate": 1.3320939378125168e-06, "loss": -0.0059, "num_tokens": 8813983.0, "reward": 13.769775390625, "reward_std": 0.5637372136116028, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7984813451766968, "rewards/length2tails_reward/std": 0.25650182366371155, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09755912981927395, "epoch": 2.026, "frac_reward_zero_std": 0.0, "grad_norm": 0.09963485598564148, "learning_rate": 1.3308841566711537e-06, "loss": -0.0049, "num_tokens": 8822753.0, "reward": 13.061192512512207, "reward_std": 2.7797274589538574, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3953945636749268, "rewards/kidney_reward/std": 0.9332201480865479, "rewards/length2tails_reward/mean": 0.8119730949401855, "rewards/length2tails_reward/std": 0.2405090183019638, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.180924892425537, "rewards/thermo_reward/std": 1.625937581062317, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.10167609713971615, "epoch": 2.028, "frac_reward_zero_std": 0.0, "grad_norm": 0.1964738517999649, "learning_rate": 1.3296738314739338e-06, "loss": -0.0037, "num_tokens": 8831497.0, "reward": 12.929141998291016, "reward_std": 2.913825035095215, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3505191802978516, "rewards/kidney_reward/std": 1.1123260259628296, "rewards/length2tails_reward/mean": 0.7692917585372925, "rewards/length2tails_reward/std": 0.2531833350658417, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0980172157287598, "rewards/thermo_reward/std": 1.6565762758255005, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09269763343036175, "epoch": 2.03, "frac_reward_zero_std": 0.0, "grad_norm": 0.09346351772546768, "learning_rate": 1.3284629642109324e-06, "loss": 0.0007, "num_tokens": 8840245.0, "reward": 12.098461151123047, "reward_std": 4.814784049987793, "rewards/fitness_reward/mean": 7.00302791595459, "rewards/fitness_reward/std": 2.0260438919067383, "rewards/kidney_reward/mean": 2.2101261615753174, "rewards/kidney_reward/std": 1.2059481143951416, "rewards/length2tails_reward/mean": 0.7802326679229736, "rewards/length2tails_reward/std": 0.24878345429897308, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.7072839736938477, "rewards/thermo_reward/std": 2.170975923538208, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09186808485537767, "epoch": 2.032, "frac_reward_zero_std": 0.0, "grad_norm": 0.16771812736988068, "learning_rate": 1.327251556873117e-06, "loss": -0.0048, "num_tokens": 8848980.0, "reward": 12.41139030456543, "reward_std": 4.630665302276611, "rewards/fitness_reward/mean": 6.72658634185791, "rewards/fitness_reward/std": 2.49751615524292, "rewards/kidney_reward/mean": 2.326782464981079, "rewards/kidney_reward/std": 1.0067613124847412, "rewards/length2tails_reward/mean": 0.7035547494888306, "rewards/length2tails_reward/std": 0.3279150128364563, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1876654624938965, "rewards/thermo_reward/std": 1.489618182182312, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.34375, "completions/mean_terminated_length": 271.34375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.10063336603343487, "epoch": 2.034, "frac_reward_zero_std": 0.0, "grad_norm": 0.11937062442302704, "learning_rate": 1.3260396114523417e-06, "loss": 0.0058, "num_tokens": 8857695.0, "reward": 13.733278274536133, "reward_std": 0.578973114490509, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8323647975921631, "rewards/length2tails_reward/std": 0.20198071002960205, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.21875, "completions/mean_terminated_length": 273.21875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09128071740269661, "epoch": 2.036, "frac_reward_zero_std": 0.0, "grad_norm": 0.16607420146465302, "learning_rate": 1.3248271299413474e-06, "loss": 0.0021, "num_tokens": 8866470.0, "reward": 13.388148307800293, "reward_std": 1.5452548265457153, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.8022655248641968, "rewards/length2tails_reward/std": 0.25334814190864563, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3023343086242676, "rewards/thermo_reward/std": 1.3248815536499023, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10503023117780685, "epoch": 2.038, "frac_reward_zero_std": 0.0, "grad_norm": 0.0910627543926239, "learning_rate": 1.323614114333754e-06, "loss": 0.0009, "num_tokens": 8875230.0, "reward": 13.491312980651855, "reward_std": 0.6788957118988037, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8058663010597229, "rewards/length2tails_reward/std": 0.21435952186584473, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3504209518432617, "rewards/thermo_reward/std": 0.6277978420257568, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09710283949971199, "epoch": 2.04, "frac_reward_zero_std": 0.0, "grad_norm": 0.10189030319452286, "learning_rate": 1.3224005666240623e-06, "loss": -0.0023, "num_tokens": 8883984.0, "reward": 13.674921035766602, "reward_std": 0.5684311985969543, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.772911787033081, "rewards/length2tails_reward/std": 0.2503526210784912, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.50996470451355, "rewards/thermo_reward/std": 0.5615194439888, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 274.0, "completions/mean_terminated_length": 274.0, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10056890733540058, "epoch": 2.042, "frac_reward_zero_std": 0.0, "grad_norm": 0.121220663189888, "learning_rate": 1.3211864888076456e-06, "loss": -0.0001, "num_tokens": 8892784.0, "reward": 13.64167594909668, "reward_std": 0.5900519490242004, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8393223285675049, "rewards/length2tails_reward/std": 0.24542348086833954, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.470078706741333, "rewards/thermo_reward/std": 0.5830413699150085, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09844275377690792, "epoch": 2.044, "frac_reward_zero_std": 0.0, "grad_norm": 0.09364219754934311, "learning_rate": 1.31997188288075e-06, "loss": -0.0015, "num_tokens": 8901539.0, "reward": 13.416436195373535, "reward_std": 1.4693506956100464, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8285934925079346, "rewards/length2tails_reward/std": 0.2166094034910202, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.300631046295166, "rewards/thermo_reward/std": 1.3310917615890503, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.9375, "completions/mean_terminated_length": 271.9375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09990180004388094, "epoch": 2.046, "frac_reward_zero_std": 0.0, "grad_norm": 0.07670339941978455, "learning_rate": 1.3187567508404898e-06, "loss": -0.0037, "num_tokens": 8910273.0, "reward": 13.480457305908203, "reward_std": 1.139972448348999, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.761543333530426, "rewards/length2tails_reward/std": 0.2405269891023636, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.316638231277466, "rewards/thermo_reward/std": 1.1297534704208374, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09408196154981852, "epoch": 2.048, "frac_reward_zero_std": 0.0, "grad_norm": 0.0961371660232544, "learning_rate": 1.3175410946848444e-06, "loss": 0.0005, "num_tokens": 8919028.0, "reward": 13.795574188232422, "reward_std": 0.46939560770988464, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7828723192214966, "rewards/length2tails_reward/std": 0.2605209946632385, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08972968813031912, "epoch": 2.05, "frac_reward_zero_std": 0.0, "grad_norm": 0.10491012036800385, "learning_rate": 1.3163249164126547e-06, "loss": -0.0046, "num_tokens": 8927763.0, "reward": 13.144769668579102, "reward_std": 2.383373975753784, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.466433048248291, "rewards/kidney_reward/std": 0.7614781260490417, "rewards/length2tails_reward/mean": 0.7181075811386108, "rewards/length2tails_reward/std": 0.3215291500091553, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2028496265411377, "rewards/thermo_reward/std": 1.4257335662841797, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.09596592746675014, "epoch": 2.052, "frac_reward_zero_std": 0.0, "grad_norm": 0.31508520245552063, "learning_rate": 1.3151082180236209e-06, "loss": -0.0119, "num_tokens": 8936459.0, "reward": 13.793975830078125, "reward_std": 0.46918433904647827, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7668845653533936, "rewards/length2tails_reward/std": 0.2270083874464035, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08913787268102169, "epoch": 2.054, "frac_reward_zero_std": 0.0, "grad_norm": 0.07905755937099457, "learning_rate": 1.3138910015182968e-06, "loss": -0.0013, "num_tokens": 8945207.0, "reward": 12.620532989501953, "reward_std": 3.6539318561553955, "rewards/fitness_reward/mean": 7.188657283782959, "rewards/fitness_reward/std": 0.7179933190345764, "rewards/kidney_reward/mean": 2.3732101917266846, "rewards/kidney_reward/std": 1.041853427886963, "rewards/length2tails_reward/mean": 0.767655611038208, "rewards/length2tails_reward/std": 0.29074984788894653, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.8819003105163574, "rewards/thermo_reward/std": 2.097299575805664, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.90625, "completions/mean_terminated_length": 271.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10382816102355719, "epoch": 2.056, "frac_reward_zero_std": 0.0, "grad_norm": 0.08299266546964645, "learning_rate": 1.31267326889809e-06, "loss": -0.0031, "num_tokens": 8953940.0, "reward": 13.17679214477539, "reward_std": 1.736342430114746, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5099523067474365, "rewards/kidney_reward/std": 0.5228261947631836, "rewards/length2tails_reward/mean": 0.7341771125793457, "rewards/length2tails_reward/std": 0.27263689041137695, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.132237434387207, "rewards/thermo_reward/std": 1.3638969659805298, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.65625, "completions/mean_terminated_length": 272.65625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09531169943511486, "epoch": 2.058, "frac_reward_zero_std": 0.0, "grad_norm": 0.08293966203927994, "learning_rate": 1.3114550221652552e-06, "loss": -0.001, "num_tokens": 8962697.0, "reward": 13.612466812133789, "reward_std": 1.2178254127502441, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7928884029388428, "rewards/length2tails_reward/std": 0.22859860956668854, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.472871780395508, "rewards/thermo_reward/std": 1.1119534969329834, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.53125, "completions/mean_terminated_length": 273.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09332179836928844, "epoch": 2.06, "frac_reward_zero_std": 0.0, "grad_norm": 0.1411670595407486, "learning_rate": 1.3102362633228933e-06, "loss": 0.0021, "num_tokens": 8971482.0, "reward": 13.582618713378906, "reward_std": 1.0881257057189941, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8324671387672424, "rewards/length2tails_reward/std": 0.23901960253715515, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4390664100646973, "rewards/thermo_reward/std": 0.952990710735321, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.71875, "completions/mean_terminated_length": 271.71875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08503109263256192, "epoch": 2.062, "frac_reward_zero_std": 0.0, "grad_norm": 0.1336006224155426, "learning_rate": 1.3090169943749473e-06, "loss": -0.0068, "num_tokens": 8980209.0, "reward": 13.105493545532227, "reward_std": 2.8300795555114746, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4735350608825684, "rewards/kidney_reward/std": 0.8651843667030334, "rewards/length2tails_reward/mean": 0.7215414047241211, "rewards/length2tails_reward/std": 0.294627845287323, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.156128168106079, "rewards/thermo_reward/std": 1.8703521490097046, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0927619244903326, "epoch": 2.064, "frac_reward_zero_std": 0.0, "grad_norm": 0.08014991134405136, "learning_rate": 1.3077972173261983e-06, "loss": -0.0042, "num_tokens": 8988947.0, "reward": 13.46107292175293, "reward_std": 1.571915864944458, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5046491622924805, "rewards/kidney_reward/std": 0.5515542030334473, "rewards/length2tails_reward/mean": 0.7611790895462036, "rewards/length2tails_reward/std": 0.2673257291316986, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4191207885742188, "rewards/thermo_reward/std": 1.0504286289215088, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.40625, "completions/mean_terminated_length": 272.40625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09465755335986614, "epoch": 2.066, "frac_reward_zero_std": 0.0, "grad_norm": 0.07499520480632782, "learning_rate": 1.3065769341822632e-06, "loss": -0.0006, "num_tokens": 8997696.0, "reward": 13.690796852111816, "reward_std": 0.8897803425788879, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.767320990562439, "rewards/length2tails_reward/std": 0.24395054578781128, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.526400089263916, "rewards/thermo_reward/std": 0.8900278806686401, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.08990177698433399, "epoch": 2.068, "frac_reward_zero_std": 0.0, "grad_norm": 0.11101987957954407, "learning_rate": 1.305356146949591e-06, "loss": 0.0041, "num_tokens": 9006424.0, "reward": 13.398303985595703, "reward_std": 1.4064549207687378, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7223544120788574, "rewards/length2tails_reward/std": 0.29692718386650085, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2931222915649414, "rewards/thermo_reward/std": 1.2151144742965698, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09254649002104998, "epoch": 2.07, "frac_reward_zero_std": 0.0, "grad_norm": 0.13691101968288422, "learning_rate": 1.3041348576354594e-06, "loss": 0.0028, "num_tokens": 9015189.0, "reward": 13.770145416259766, "reward_std": 0.5533500909805298, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8021765947341919, "rewards/length2tails_reward/std": 0.2569216191768646, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09089200105518103, "epoch": 2.072, "frac_reward_zero_std": 0.0, "grad_norm": 0.05383050814270973, "learning_rate": 1.302913068247972e-06, "loss": -0.0063, "num_tokens": 9023941.0, "reward": 12.733110427856445, "reward_std": 3.2412171363830566, "rewards/fitness_reward/mean": 6.99554443359375, "rewards/fitness_reward/std": 1.7628074884414673, "rewards/kidney_reward/mean": 2.374934673309326, "rewards/kidney_reward/std": 0.579054057598114, "rewards/length2tails_reward/mean": 0.7359942197799683, "rewards/length2tails_reward/std": 0.3207305073738098, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1890320777893066, "rewards/thermo_reward/std": 1.321785569190979, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.1875, "completions/mean_terminated_length": 272.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09400006104260683, "epoch": 2.074, "frac_reward_zero_std": 0.0, "grad_norm": 0.19333302974700928, "learning_rate": 1.3016907807960549e-06, "loss": -0.0012, "num_tokens": 9032683.0, "reward": 12.365195274353027, "reward_std": 6.029626369476318, "rewards/fitness_reward/mean": 6.920212745666504, "rewards/fitness_reward/std": 2.4945178031921387, "rewards/kidney_reward/mean": 2.205441474914551, "rewards/kidney_reward/std": 1.681620717048645, "rewards/length2tails_reward/mean": 0.7598259449005127, "rewards/length2tails_reward/std": 0.2702024579048157, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.063559055328369, "rewards/thermo_reward/std": 2.0836944580078125, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0850873845629394, "epoch": 2.076, "frac_reward_zero_std": 0.0, "grad_norm": 0.3471170663833618, "learning_rate": 1.3004679972894518e-06, "loss": -0.004, "num_tokens": 9041420.0, "reward": 13.284088134765625, "reward_std": 2.320343255996704, "rewards/fitness_reward/mean": 7.049720287322998, "rewards/fitness_reward/std": 1.7619104385375977, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7075710296630859, "rewards/length2tails_reward/std": 0.2854107916355133, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.464489221572876, "rewards/thermo_reward/std": 1.0130728483200073, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.90625, "completions/mean_terminated_length": 270.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09551345184445381, "epoch": 2.078, "frac_reward_zero_std": 0.0, "grad_norm": 0.08493397384881973, "learning_rate": 1.2992447197387238e-06, "loss": -0.0022, "num_tokens": 9050121.0, "reward": 13.317670822143555, "reward_std": 2.1450092792510986, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.445978879928589, "rewards/kidney_reward/std": 0.7337779402732849, "rewards/length2tails_reward/mean": 0.6894269585609436, "rewards/length2tails_reward/std": 0.3035351037979126, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3415637016296387, "rewards/thermo_reward/std": 1.474583625793457, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 274.46875, "completions/mean_terminated_length": 274.46875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.1033138744533062, "epoch": 2.08, "frac_reward_zero_std": 0.0, "grad_norm": 0.09677430987358093, "learning_rate": 1.2980209501552426e-06, "loss": -0.0012, "num_tokens": 9058936.0, "reward": 13.546756744384766, "reward_std": 1.086491584777832, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.8765187859535217, "rewards/length2tails_reward/std": 0.17953187227249146, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.453517198562622, "rewards/thermo_reward/std": 0.8846840858459473, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.8125, "completions/mean_terminated_length": 271.8125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08544276189059019, "epoch": 2.082, "frac_reward_zero_std": 0.0, "grad_norm": 0.10554521530866623, "learning_rate": 1.2967966905511905e-06, "loss": -0.0064, "num_tokens": 9067666.0, "reward": 13.696599960327148, "reward_std": 1.0239237546920776, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.690746009349823, "rewards/length2tails_reward/std": 0.35588833689689636, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5672197341918945, "rewards/thermo_reward/std": 0.8686692714691162, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.6875, "completions/mean_terminated_length": 270.6875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09616864379495382, "epoch": 2.084, "frac_reward_zero_std": 0.0, "grad_norm": 0.12321001291275024, "learning_rate": 1.2955719429395546e-06, "loss": -0.0042, "num_tokens": 9076360.0, "reward": 13.158604621887207, "reward_std": 2.375380516052246, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.486342430114746, "rewards/kidney_reward/std": 0.651604175567627, "rewards/length2tails_reward/mean": 0.700831413269043, "rewards/length2tails_reward/std": 0.3187938928604126, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.140994071960449, "rewards/thermo_reward/std": 1.8229484558105469, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.3125, "completions/mean_terminated_length": 271.3125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.10125172417610884, "epoch": 2.086, "frac_reward_zero_std": 0.0, "grad_norm": 0.1306227445602417, "learning_rate": 1.2943467093341244e-06, "loss": 0.0002, "num_tokens": 9085074.0, "reward": 12.651572227478027, "reward_std": 5.144802093505859, "rewards/fitness_reward/mean": 6.965117454528809, "rewards/fitness_reward/std": 2.240497589111328, "rewards/kidney_reward/mean": 2.4036145210266113, "rewards/kidney_reward/std": 1.2607145309448242, "rewards/length2tails_reward/mean": 0.7663254737854004, "rewards/length2tails_reward/std": 0.2889622747898102, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1062076091766357, "rewards/thermo_reward/std": 1.9814021587371826, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10001961886882782, "epoch": 2.088, "frac_reward_zero_std": 0.0, "grad_norm": 0.09242808818817139, "learning_rate": 1.2931209917494894e-06, "loss": -0.0029, "num_tokens": 9093802.0, "reward": 13.539291381835938, "reward_std": 1.2714763879776, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7442175149917603, "rewards/length2tails_reward/std": 0.29208293557167053, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.377204418182373, "rewards/thermo_reward/std": 1.263572335243225, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.0941868107765913, "epoch": 2.09, "frac_reward_zero_std": 0.0, "grad_norm": 0.0928439348936081, "learning_rate": 1.2918947922010336e-06, "loss": -0.0035, "num_tokens": 9102510.0, "reward": 13.199728012084961, "reward_std": 2.9908151626586914, "rewards/fitness_reward/mean": 7.01439094543457, "rewards/fitness_reward/std": 1.9617626667022705, "rewards/kidney_reward/mean": 2.4852182865142822, "rewards/kidney_reward/std": 0.6577827334403992, "rewards/length2tails_reward/mean": 0.7584496736526489, "rewards/length2tails_reward/std": 0.2767479121685028, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.524273633956909, "rewards/thermo_reward/std": 1.296755313873291, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.09199702274054289, "epoch": 2.092, "frac_reward_zero_std": 0.0, "grad_norm": 0.10496368259191513, "learning_rate": 1.2906681127049338e-06, "loss": 0.0011, "num_tokens": 9111222.0, "reward": 13.468603134155273, "reward_std": 1.3402777910232544, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7977195382118225, "rewards/length2tails_reward/std": 0.25168755650520325, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3558850288391113, "rewards/thermo_reward/std": 1.1745244264602661, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.78125, "completions/mean_terminated_length": 273.78125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.08931320626288652, "epoch": 2.094, "frac_reward_zero_std": 0.0, "grad_norm": 0.09162434935569763, "learning_rate": 1.2894409552781564e-06, "loss": -0.0006, "num_tokens": 9120015.0, "reward": 13.722648620605469, "reward_std": 0.9620515704154968, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8560950756072998, "rewards/length2tails_reward/std": 0.16277632117271423, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.549374580383301, "rewards/thermo_reward/std": 0.9605962038040161, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.65625, "completions/mean_terminated_length": 271.65625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10179832112044096, "epoch": 2.096, "frac_reward_zero_std": 0.0, "grad_norm": 0.15232229232788086, "learning_rate": 1.2882133219384538e-06, "loss": -0.0009, "num_tokens": 9128740.0, "reward": 13.005573272705078, "reward_std": 4.204520225524902, "rewards/fitness_reward/mean": 7.045802116394043, "rewards/fitness_reward/std": 1.7840752601623535, "rewards/kidney_reward/mean": 2.449796438217163, "rewards/kidney_reward/std": 0.9994713068008423, "rewards/length2tails_reward/mean": 0.7428468465805054, "rewards/length2tails_reward/std": 0.2785884141921997, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3356895446777344, "rewards/thermo_reward/std": 1.4818718433380127, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.65625, "completions/mean_terminated_length": 272.65625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09577594231814146, "epoch": 2.098, "frac_reward_zero_std": 0.0, "grad_norm": 0.07349243760108948, "learning_rate": 1.2869852147043605e-06, "loss": 0.0014, "num_tokens": 9137497.0, "reward": 13.581193923950195, "reward_std": 1.2682756185531616, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7896881103515625, "rewards/length2tails_reward/std": 0.251890629529953, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4419198036193848, "rewards/thermo_reward/std": 1.1277644634246826, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.65625, "completions/mean_terminated_length": 272.65625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08849729225039482, "epoch": 2.1, "frac_reward_zero_std": 0.0, "grad_norm": 0.10994908213615417, "learning_rate": 1.2857566355951903e-06, "loss": -0.0046, "num_tokens": 9146254.0, "reward": 13.484933853149414, "reward_std": 1.3967456817626953, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7338998317718506, "rewards/length2tails_reward/std": 0.3522545397281647, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.35123872756958, "rewards/thermo_reward/std": 1.2868434190750122, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08763735927641392, "epoch": 2.102, "frac_reward_zero_std": 0.0, "grad_norm": 0.08444438874721527, "learning_rate": 1.2845275866310324e-06, "loss": -0.0005, "num_tokens": 9155022.0, "reward": 12.240044593811035, "reward_std": 5.094367504119873, "rewards/fitness_reward/mean": 6.943960189819336, "rewards/fitness_reward/std": 2.050309658050537, "rewards/kidney_reward/mean": 2.2489984035491943, "rewards/kidney_reward/std": 1.3802794218063354, "rewards/length2tails_reward/mean": 0.7449901103973389, "rewards/length2tails_reward/std": 0.31968122720718384, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.872586727142334, "rewards/thermo_reward/std": 2.0441460609436035, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09333954565227032, "epoch": 2.104, "frac_reward_zero_std": 0.0, "grad_norm": 0.058305464684963226, "learning_rate": 1.2832980698327494e-06, "loss": 0.0052, "num_tokens": 9163762.0, "reward": 13.915213584899902, "reward_std": 0.30910414457321167, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7826879620552063, "rewards/length2tails_reward/std": 0.2214636653661728, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1027986891567707, "epoch": 2.106, "frac_reward_zero_std": 0.0, "grad_norm": 0.10487403720617294, "learning_rate": 1.2820680872219728e-06, "loss": 0.002, "num_tokens": 9172500.0, "reward": 12.580307960510254, "reward_std": 2.7758026123046875, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4055867195129395, "rewards/kidney_reward/std": 0.7715975046157837, "rewards/length2tails_reward/mean": 0.6917127370834351, "rewards/length2tails_reward/std": 0.3424265384674072, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6443653106689453, "rewards/thermo_reward/std": 2.147477865219116, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.5, "completions/mean_terminated_length": 273.5, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09246054664254189, "epoch": 2.108, "frac_reward_zero_std": 0.0, "grad_norm": 0.1013147234916687, "learning_rate": 1.2808376408210994e-06, "loss": -0.0001, "num_tokens": 9181284.0, "reward": 13.547618865966797, "reward_std": 1.146050214767456, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8119996786117554, "rewards/length2tails_reward/std": 0.21671175956726074, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.406114101409912, "rewards/thermo_reward/std": 1.1155322790145874, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.90625, "completions/mean_terminated_length": 271.90625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09602517727762461, "epoch": 2.11, "frac_reward_zero_std": 0.0, "grad_norm": 0.0790761411190033, "learning_rate": 1.27960673265329e-06, "loss": -0.0023, "num_tokens": 9190017.0, "reward": 13.315729141235352, "reward_std": 1.9026858806610107, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5110602378845215, "rewards/kidney_reward/std": 0.5168427228927612, "rewards/length2tails_reward/mean": 0.7272899150848389, "rewards/length2tails_reward/std": 0.3061966896057129, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.328263759613037, "rewards/thermo_reward/std": 1.1189709901809692, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.25, "completions/mean_terminated_length": 273.25, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09410333167761564, "epoch": 2.112, "frac_reward_zero_std": 0.0, "grad_norm": 0.10358601808547974, "learning_rate": 1.2783753647424632e-06, "loss": -0.0006, "num_tokens": 9198793.0, "reward": 13.800519943237305, "reward_std": 0.47215381264686584, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8323312401771545, "rewards/length2tails_reward/std": 0.21438726782798767, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.4375, "completions/mean_terminated_length": 270.4375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.09133266471326351, "epoch": 2.114, "frac_reward_zero_std": 0.0, "grad_norm": 0.12086813151836395, "learning_rate": 1.2771435391132943e-06, "loss": -0.0026, "num_tokens": 9207479.0, "reward": 13.8716459274292, "reward_std": 0.38527801632881165, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7458708882331848, "rewards/length2tails_reward/std": 0.27414339780807495, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 270.65625, "completions/mean_terminated_length": 270.65625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "entropy": 0.10229693353176117, "epoch": 2.116, "frac_reward_zero_std": 0.0, "grad_norm": 0.08657363057136536, "learning_rate": 1.275911257791211e-06, "loss": -0.0049, "num_tokens": 9216172.0, "reward": 12.801111221313477, "reward_std": 3.3229711055755615, "rewards/fitness_reward/mean": 6.99554443359375, "rewards/fitness_reward/std": 1.7628074884414673, "rewards/kidney_reward/mean": 2.3960742950439453, "rewards/kidney_reward/std": 0.7102473974227905, "rewards/length2tails_reward/mean": 0.6917217969894409, "rewards/length2tails_reward/std": 0.32955726981163025, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2403202056884766, "rewards/thermo_reward/std": 1.3757102489471436, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.15625, "completions/mean_terminated_length": 273.15625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10404617059975863, "epoch": 2.118, "frac_reward_zero_std": 0.0, "grad_norm": 0.0734260156750679, "learning_rate": 1.2746785228023901e-06, "loss": -0.0053, "num_tokens": 9224945.0, "reward": 13.67131233215332, "reward_std": 1.030469536781311, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8276180624961853, "rewards/length2tails_reward/std": 0.2293540984392166, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.500884771347046, "rewards/thermo_reward/std": 1.0189565420150757, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08712383266538382, "epoch": 2.12, "frac_reward_zero_std": 0.0, "grad_norm": 0.10176970064640045, "learning_rate": 1.2734453361737551e-06, "loss": -0.0046, "num_tokens": 9233674.0, "reward": 13.347003936767578, "reward_std": 1.3631689548492432, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5095105171203613, "rewards/kidney_reward/std": 0.5252142548561096, "rewards/length2tails_reward/mean": 0.6960784196853638, "rewards/length2tails_reward/std": 0.329012930393219, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3067002296447754, "rewards/thermo_reward/std": 1.1650335788726807, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0984044810757041, "epoch": 2.122, "frac_reward_zero_std": 0.0, "grad_norm": 0.1514298915863037, "learning_rate": 1.272211699932971e-06, "loss": -0.0008, "num_tokens": 9242419.0, "reward": 12.934194564819336, "reward_std": 2.576376438140869, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.3321802616119385, "rewards/kidney_reward/std": 0.9315749406814575, "rewards/length2tails_reward/mean": 0.7589224576950073, "rewards/length2tails_reward/std": 0.3022225499153137, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.064936637878418, "rewards/thermo_reward/std": 1.6775540113449097, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.90625, "completions/mean_terminated_length": 270.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08853981457650661, "epoch": 2.124, "frac_reward_zero_std": 0.0, "grad_norm": 0.19971629977226257, "learning_rate": 1.2709776161084432e-06, "loss": -0.0038, "num_tokens": 9251120.0, "reward": 12.858133316040039, "reward_std": 3.0192081928253174, "rewards/fitness_reward/mean": 7.05157995223999, "rewards/fitness_reward/std": 1.7513916492462158, "rewards/kidney_reward/mean": 2.517043113708496, "rewards/kidney_reward/std": 0.2941751182079315, "rewards/length2tails_reward/mean": 0.6278231739997864, "rewards/length2tails_reward/std": 0.35386279225349426, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1267285346984863, "rewards/thermo_reward/std": 1.473052740097046, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.09375, "completions/mean_terminated_length": 272.09375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0893591595813632, "epoch": 2.126, "frac_reward_zero_std": 0.0, "grad_norm": 0.06807100772857666, "learning_rate": 1.2697430867293118e-06, "loss": -0.0046, "num_tokens": 9259859.0, "reward": 12.771037101745605, "reward_std": 3.42122220993042, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.457012176513672, "rewards/kidney_reward/std": 0.5447914004325867, "rewards/length2tails_reward/mean": 0.7093701362609863, "rewards/length2tails_reward/std": 0.30901870131492615, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.090034246444702, "rewards/thermo_reward/std": 1.7447394132614136, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09699106961488724, "epoch": 2.128, "frac_reward_zero_std": 0.0, "grad_norm": 0.09156450629234314, "learning_rate": 1.2685081138254504e-06, "loss": 0.0015, "num_tokens": 9268602.0, "reward": 13.582111358642578, "reward_std": 1.680040955543518, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5325703620910645, "rewards/kidney_reward/std": 0.5312304496765137, "rewards/length2tails_reward/mean": 0.7863272428512573, "rewards/length2tails_reward/std": 0.2618371248245239, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5097227096557617, "rewards/thermo_reward/std": 1.1701918840408325, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09031611122190952, "epoch": 2.13, "frac_reward_zero_std": 0.0, "grad_norm": 0.08338166028261185, "learning_rate": 1.267272699427462e-06, "loss": -0.0009, "num_tokens": 9277342.0, "reward": 13.708481788635254, "reward_std": 0.8082547187805176, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7606251835823059, "rewards/length2tails_reward/std": 0.26824161410331726, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08373354189097881, "epoch": 2.132, "frac_reward_zero_std": 0.0, "grad_norm": 0.08093984425067902, "learning_rate": 1.266036845566675e-06, "loss": -0.0035, "num_tokens": 9286100.0, "reward": 12.996624946594238, "reward_std": 4.823812961578369, "rewards/fitness_reward/mean": 7.006075859069824, "rewards/fitness_reward/std": 2.008800983428955, "rewards/kidney_reward/mean": 2.3853907585144043, "rewards/kidney_reward/std": 1.2139407396316528, "rewards/length2tails_reward/mean": 0.7885721325874329, "rewards/length2tails_reward/std": 0.2286292165517807, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.426300525665283, "rewards/thermo_reward/std": 1.6240335702896118, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.53125, "completions/mean_terminated_length": 272.53125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09700395073741674, "epoch": 2.134, "frac_reward_zero_std": 0.0, "grad_norm": 0.20416933298110962, "learning_rate": 1.2648005542751405e-06, "loss": 0.0052, "num_tokens": 9294853.0, "reward": 13.65412712097168, "reward_std": 1.079277753829956, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7824170589447021, "rewards/length2tails_reward/std": 0.2475566267967224, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.515580177307129, "rewards/thermo_reward/std": 0.9441563487052917, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.65625, "completions/mean_terminated_length": 272.65625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0891966512426734, "epoch": 2.136, "frac_reward_zero_std": 0.0, "grad_norm": 0.06771207600831985, "learning_rate": 1.2635638275856287e-06, "loss": -0.0061, "num_tokens": 9303610.0, "reward": 12.930241584777832, "reward_std": 2.634490966796875, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4129862785339355, "rewards/kidney_reward/std": 0.7415686249732971, "rewards/length2tails_reward/mean": 0.7700674533843994, "rewards/length2tails_reward/std": 0.28495827317237854, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.979062795639038, "rewards/thermo_reward/std": 1.927284598350525, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.03125, "completions/mean_terminated_length": 271.03125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.09346739295870066, "epoch": 2.138, "frac_reward_zero_std": 0.0, "grad_norm": 0.20411282777786255, "learning_rate": 1.2623266675316263e-06, "loss": -0.0098, "num_tokens": 9312315.0, "reward": 13.800832748413086, "reward_std": 0.47488632798194885, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8354582190513611, "rewards/length2tails_reward/std": 0.2633455991744995, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.65625, "completions/mean_terminated_length": 272.65625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0999509422108531, "epoch": 2.14, "frac_reward_zero_std": 0.0, "grad_norm": 0.05981971323490143, "learning_rate": 1.2610890761473315e-06, "loss": -0.0048, "num_tokens": 9321072.0, "reward": 13.200286865234375, "reward_std": 2.0932772159576416, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4858555793762207, "rewards/kidney_reward/std": 0.6542784571647644, "rewards/length2tails_reward/mean": 0.730256199836731, "rewards/length2tails_reward/std": 0.32691308856010437, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.180220603942871, "rewards/thermo_reward/std": 1.4679925441741943, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.625, "completions/mean_terminated_length": 269.625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.09472297970205545, "epoch": 2.142, "frac_reward_zero_std": 0.0, "grad_norm": 0.09255687892436981, "learning_rate": 1.2598510554676528e-06, "loss": -0.0027, "num_tokens": 9329732.0, "reward": 13.429327011108398, "reward_std": 1.4543567895889282, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.711338222026825, "rewards/length2tails_reward/std": 0.29834526777267456, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2978878021240234, "rewards/thermo_reward/std": 1.341590166091919, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.8125, "completions/mean_terminated_length": 271.8125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09138771425932646, "epoch": 2.144, "frac_reward_zero_std": 0.0, "grad_norm": 0.0674971267580986, "learning_rate": 1.2586126075282045e-06, "loss": -0.0051, "num_tokens": 9338462.0, "reward": 13.338953018188477, "reward_std": 2.1979379653930664, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.466801643371582, "rewards/kidney_reward/std": 0.7594356536865234, "rewards/length2tails_reward/mean": 0.7021905183792114, "rewards/length2tails_reward/std": 0.3625527024269104, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.398256778717041, "rewards/thermo_reward/std": 1.1553550958633423, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.40625, "completions/mean_terminated_length": 272.40625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09341870713979006, "epoch": 2.146, "frac_reward_zero_std": 0.0, "grad_norm": 0.10488131642341614, "learning_rate": 1.2573737343653023e-06, "loss": -0.0048, "num_tokens": 9347211.0, "reward": 13.0523042678833, "reward_std": 2.726144313812256, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.3864073753356934, "rewards/kidney_reward/std": 0.9536484479904175, "rewards/length2tails_reward/mean": 0.7425001859664917, "rewards/length2tails_reward/std": 0.32825616002082825, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1304609775543213, "rewards/thermo_reward/std": 1.8517941236495972, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10365541558712721, "epoch": 2.148, "frac_reward_zero_std": 0.0, "grad_norm": 0.22300338745117188, "learning_rate": 1.2561344380159627e-06, "loss": -0.0038, "num_tokens": 9355976.0, "reward": 13.19325065612793, "reward_std": 2.598496913909912, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4012303352355957, "rewards/kidney_reward/std": 0.8964017033576965, "rewards/length2tails_reward/mean": 0.8006282448768616, "rewards/length2tails_reward/std": 0.25021499395370483, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3082823753356934, "rewards/thermo_reward/std": 1.4561070203781128, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.28125, "completions/mean_terminated_length": 273.28125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08755374420434237, "epoch": 2.15, "frac_reward_zero_std": 0.0, "grad_norm": 0.07114322483539581, "learning_rate": 1.2548947205178962e-06, "loss": -0.0012, "num_tokens": 9364753.0, "reward": 13.618678092956543, "reward_std": 0.9257692694664001, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8494964838027954, "rewards/length2tails_reward/std": 0.16474813222885132, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4460642337799072, "rewards/thermo_reward/std": 0.9196395874023438, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.0993583258241415, "epoch": 2.152, "frac_reward_zero_std": 0.0, "grad_norm": 0.07258055359125137, "learning_rate": 1.2536545839095072e-06, "loss": -0.0037, "num_tokens": 9373498.0, "reward": 12.512422561645508, "reward_std": 5.39253044128418, "rewards/fitness_reward/mean": 6.981512546539307, "rewards/fitness_reward/std": 2.147751569747925, "rewards/kidney_reward/mean": 2.2829885482788086, "rewards/kidney_reward/std": 1.4037550687789917, "rewards/length2tails_reward/mean": 0.7973028421401978, "rewards/length2tails_reward/std": 0.27729007601737976, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.068190813064575, "rewards/thermo_reward/std": 2.207812786102295, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09348468575626612, "epoch": 2.154, "frac_reward_zero_std": 0.0, "grad_norm": 0.08929450809955597, "learning_rate": 1.2524140302298891e-06, "loss": -0.004, "num_tokens": 9382259.0, "reward": 13.80868911743164, "reward_std": 0.526460587978363, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.788748025894165, "rewards/length2tails_reward/std": 0.27583274245262146, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.0946068437770009, "epoch": 2.156, "frac_reward_zero_std": 0.0, "grad_norm": 0.0643715113401413, "learning_rate": 1.2511730615188204e-06, "loss": -0.006, "num_tokens": 9390979.0, "reward": 12.824613571166992, "reward_std": 2.739434242248535, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.348367214202881, "rewards/kidney_reward/std": 0.9928209781646729, "rewards/length2tails_reward/mean": 0.7526307106018066, "rewards/length2tails_reward/std": 0.367491751909256, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.939798355102539, "rewards/thermo_reward/std": 1.7655022144317627, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10376318357884884, "epoch": 2.158, "frac_reward_zero_std": 0.0, "grad_norm": 0.16339758038520813, "learning_rate": 1.249931679816762e-06, "loss": 0.0051, "num_tokens": 9399722.0, "reward": 11.997785568237305, "reward_std": 6.722228527069092, "rewards/fitness_reward/mean": 6.633974075317383, "rewards/fitness_reward/std": 2.8792197704315186, "rewards/kidney_reward/mean": 2.1771368980407715, "rewards/kidney_reward/std": 1.6707159280776978, "rewards/length2tails_reward/mean": 0.78455650806427, "rewards/length2tails_reward/std": 0.2624914050102234, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0082194805145264, "rewards/thermo_reward/std": 2.290698528289795, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.84375, "completions/mean_terminated_length": 270.84375, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.09403804317116737, "epoch": 2.16, "frac_reward_zero_std": 0.0, "grad_norm": 0.09548342227935791, "learning_rate": 1.2486898871648551e-06, "loss": -0.0067, "num_tokens": 9408421.0, "reward": 13.616037368774414, "reward_std": 1.0823116302490234, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.761284589767456, "rewards/length2tails_reward/std": 0.2536487877368927, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.452244281768799, "rewards/thermo_reward/std": 1.0749342441558838, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.09731936268508434, "epoch": 2.162, "frac_reward_zero_std": 0.0, "grad_norm": 0.10964816063642502, "learning_rate": 1.2474476856049143e-06, "loss": -0.0017, "num_tokens": 9417129.0, "reward": 13.43002986907959, "reward_std": 1.321260929107666, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7619011402130127, "rewards/length2tails_reward/std": 0.29861509799957275, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.293534517288208, "rewards/thermo_reward/std": 1.2125825881958008, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.15625, "completions/mean_terminated_length": 273.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08726116176694632, "epoch": 2.164, "frac_reward_zero_std": 0.0, "grad_norm": 0.07873732596635818, "learning_rate": 1.2462050771794292e-06, "loss": -0.0035, "num_tokens": 9425902.0, "reward": 13.51996898651123, "reward_std": 2.503188133239746, "rewards/fitness_reward/mean": 6.988970756530762, "rewards/fitness_reward/std": 2.1055612564086914, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8259742259979248, "rewards/length2tails_reward/std": 0.24308039247989655, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.09302136953920126, "epoch": 2.166, "frac_reward_zero_std": 0.0, "grad_norm": 0.13266915082931519, "learning_rate": 1.2449620639315567e-06, "loss": 0.0018, "num_tokens": 9434637.0, "reward": 13.589839935302734, "reward_std": 1.0492033958435059, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8250849843025208, "rewards/length2tails_reward/std": 0.22939039766788483, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4470252990722656, "rewards/thermo_reward/std": 0.9150979518890381, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.71875, "completions/mean_terminated_length": 272.71875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09248879831284285, "epoch": 2.168, "frac_reward_zero_std": 0.0, "grad_norm": 0.1034470871090889, "learning_rate": 1.2437186479051198e-06, "loss": -0.0023, "num_tokens": 9443396.0, "reward": 13.570369720458984, "reward_std": 1.1367368698120117, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5380520820617676, "rewards/kidney_reward/std": 0.500221848487854, "rewards/length2tails_reward/mean": 0.7879185676574707, "rewards/length2tails_reward/std": 0.24495474994182587, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498504638671875, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 271.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09990228898823261, "epoch": 2.17, "frac_reward_zero_std": 0.0, "grad_norm": 0.1326744556427002, "learning_rate": 1.2424748311446038e-06, "loss": -0.0034, "num_tokens": 9452119.0, "reward": 13.452388763427734, "reward_std": 1.3028936386108398, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.715055525302887, "rewards/length2tails_reward/std": 0.2911849617958069, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.320578098297119, "rewards/thermo_reward/std": 1.2647727727890015, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.71875, "completions/mean_terminated_length": 271.71875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09053017944097519, "epoch": 2.172, "frac_reward_zero_std": 0.0, "grad_norm": 0.09604272991418839, "learning_rate": 1.2412306156951524e-06, "loss": -0.0044, "num_tokens": 9460846.0, "reward": 13.911317825317383, "reward_std": 0.3229130804538727, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7437255382537842, "rewards/length2tails_reward/std": 0.24738410115242004, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.28125, "completions/mean_terminated_length": 271.28125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10133631248027086, "epoch": 2.174, "frac_reward_zero_std": 0.0, "grad_norm": 0.070340096950531, "learning_rate": 1.2399860036025658e-06, "loss": -0.0039, "num_tokens": 9469559.0, "reward": 12.913904190063477, "reward_std": 3.856182813644409, "rewards/fitness_reward/mean": 7.050872802734375, "rewards/fitness_reward/std": 1.755391001701355, "rewards/kidney_reward/mean": 2.4534530639648438, "rewards/kidney_reward/std": 0.8335072994232178, "rewards/length2tails_reward/mean": 0.6835463047027588, "rewards/length2tails_reward/std": 0.3237307369709015, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.241224527359009, "rewards/thermo_reward/std": 1.4365715980529785, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 270.6875, "completions/mean_terminated_length": 270.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0889931432902813, "epoch": 2.176, "frac_reward_zero_std": 0.0, "grad_norm": 0.06201954558491707, "learning_rate": 1.2387409969132959e-06, "loss": -0.0045, "num_tokens": 9478253.0, "reward": 13.459275245666504, "reward_std": 3.005136013031006, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.5390896797180176, "rewards/kidney_reward/std": 0.49435171484947205, "rewards/length2tails_reward/mean": 0.7147153615951538, "rewards/length2tails_reward/std": 0.2847312092781067, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.69566011428833, "rewards/thermo_reward/std": 0.7545809149742126, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09669537469744682, "epoch": 2.178, "frac_reward_zero_std": 0.0, "grad_norm": 0.13551564514636993, "learning_rate": 1.237495597674443e-06, "loss": 0.001, "num_tokens": 9487007.0, "reward": 12.33701229095459, "reward_std": 4.668380260467529, "rewards/fitness_reward/mean": 6.98996114730835, "rewards/fitness_reward/std": 1.793858528137207, "rewards/kidney_reward/mean": 2.2286503314971924, "rewards/kidney_reward/std": 1.217890977859497, "rewards/length2tails_reward/mean": 0.8198975324630737, "rewards/length2tails_reward/std": 0.24321754276752472, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.936410903930664, "rewards/thermo_reward/std": 2.0050437450408936, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.65625, "completions/mean_terminated_length": 271.65625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09151883330196142, "epoch": 2.18, "frac_reward_zero_std": 0.0, "grad_norm": 0.08709075301885605, "learning_rate": 1.236249807933753e-06, "loss": -0.0027, "num_tokens": 9495732.0, "reward": 12.781303405761719, "reward_std": 2.283271551132202, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.354283332824707, "rewards/kidney_reward/std": 0.8840342164039612, "rewards/length2tails_reward/mean": 0.7024877071380615, "rewards/length2tails_reward/std": 0.3341656029224396, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.953094720840454, "rewards/thermo_reward/std": 1.5103843212127686, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.5, "completions/mean_terminated_length": 273.5, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08445745427161455, "epoch": 2.182, "frac_reward_zero_std": 0.0, "grad_norm": 0.17772817611694336, "learning_rate": 1.2350036297396152e-06, "loss": -0.0015, "num_tokens": 9504516.0, "reward": 13.544971466064453, "reward_std": 0.7186257839202881, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8183159232139587, "rewards/length2tails_reward/std": 0.24614040553569794, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4301929473876953, "rewards/thermo_reward/std": 0.6010706424713135, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.40625, "completions/mean_terminated_length": 271.40625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.09109118394553661, "epoch": 2.184, "frac_reward_zero_std": 0.0, "grad_norm": 0.08223754912614822, "learning_rate": 1.2337570651410553e-06, "loss": -0.0019, "num_tokens": 9513233.0, "reward": 13.33891773223877, "reward_std": 3.061128616333008, "rewards/fitness_reward/mean": 6.987524509429932, "rewards/fitness_reward/std": 2.113743305206299, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8037495613098145, "rewards/length2tails_reward/std": 0.26475465297698975, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.571897029876709, "rewards/thermo_reward/std": 0.844916045665741, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09510310553014278, "epoch": 2.186, "frac_reward_zero_std": 0.0, "grad_norm": 0.39092686772346497, "learning_rate": 1.2325101161877363e-06, "loss": 0.0056, "num_tokens": 9521991.0, "reward": 12.502074241638184, "reward_std": 5.3984293937683105, "rewards/fitness_reward/mean": 6.960834503173828, "rewards/fitness_reward/std": 2.26472544670105, "rewards/kidney_reward/mean": 2.318307399749756, "rewards/kidney_reward/std": 1.439921259880066, "rewards/length2tails_reward/mean": 0.8009666204452515, "rewards/length2tails_reward/std": 0.23079052567481995, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0428361892700195, "rewards/thermo_reward/std": 1.952774167060852, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.15625, "completions/mean_terminated_length": 272.15625, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "entropy": 0.10605480056256056, "epoch": 2.188, "frac_reward_zero_std": 0.0, "grad_norm": 0.13017325103282928, "learning_rate": 1.2312627849299522e-06, "loss": 0.0035, "num_tokens": 9530732.0, "reward": 13.25168228149414, "reward_std": 1.6000559329986572, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.8000182509422302, "rewards/length2tails_reward/std": 0.24549025297164917, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1660938262939453, "rewards/thermo_reward/std": 1.3578734397888184, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.10298249125480652, "epoch": 2.19, "frac_reward_zero_std": 0.0, "grad_norm": 0.10652635991573334, "learning_rate": 1.2300150734186257e-06, "loss": 0.0029, "num_tokens": 9539492.0, "reward": 13.481754302978516, "reward_std": 1.0265034437179565, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8570235967636108, "rewards/length2tails_reward/std": 0.2003658562898636, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3357458114624023, "rewards/thermo_reward/std": 0.9043408632278442, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.1003633365035057, "epoch": 2.192, "frac_reward_zero_std": 0.0, "grad_norm": 0.10922206193208694, "learning_rate": 1.2287669837053055e-06, "loss": 0.0032, "num_tokens": 9548243.0, "reward": 13.641023635864258, "reward_std": 1.025151014328003, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7482881546020508, "rewards/length2tails_reward/std": 0.3050001561641693, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5058889389038086, "rewards/thermo_reward/std": 0.9933370351791382, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 285.4375, "completions/mean_terminated_length": 285.4375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "entropy": 0.09042511694133282, "epoch": 2.194, "frac_reward_zero_std": 0.0, "grad_norm": 4.114542007446289, "learning_rate": 1.2275185178421606e-06, "loss": 0.2213, "num_tokens": 9557409.0, "reward": 13.585801124572754, "reward_std": 1.15471351146698, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8321831226348877, "rewards/length2tails_reward/std": 0.24415747821331024, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.469636917114258, "rewards/thermo_reward/std": 0.9873608350753784, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.84375, "completions/mean_terminated_length": 273.84375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10123864654451609, "epoch": 2.196, "frac_reward_zero_std": 0.0, "grad_norm": 0.09772907197475433, "learning_rate": 1.2262696778819799e-06, "loss": -0.0028, "num_tokens": 9566204.0, "reward": 13.775348663330078, "reward_std": 0.5588542222976685, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8542090654373169, "rewards/length2tails_reward/std": 0.23235513269901276, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 271.59375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08716964721679688, "epoch": 2.198, "frac_reward_zero_std": 0.0, "grad_norm": 0.12908770143985748, "learning_rate": 1.2250204658781673e-06, "loss": 0.0039, "num_tokens": 9574927.0, "reward": 13.350052833557129, "reward_std": 1.4368737936019897, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.708900511264801, "rewards/length2tails_reward/std": 0.2893790602684021, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2188572883605957, "rewards/thermo_reward/std": 1.3603311777114868, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.3125, "completions/mean_terminated_length": 273.3125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.10285337548702955, "epoch": 2.2, "frac_reward_zero_std": 0.0, "grad_norm": 0.0842437818646431, "learning_rate": 1.2237708838847373e-06, "loss": -0.0023, "num_tokens": 9583705.0, "reward": 13.880155563354492, "reward_std": 0.3787178099155426, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8309652805328369, "rewards/length2tails_reward/std": 0.23853476345539093, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.21875, "completions/mean_terminated_length": 271.21875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09618408605456352, "epoch": 2.202, "frac_reward_zero_std": 0.0, "grad_norm": 0.14736229181289673, "learning_rate": 1.2225209339563143e-06, "loss": -0.0039, "num_tokens": 9592416.0, "reward": 13.134501457214355, "reward_std": 2.643918514251709, "rewards/fitness_reward/mean": 7.004948616027832, "rewards/fitness_reward/std": 2.0151779651641846, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7189492583274841, "rewards/length2tails_reward/std": 0.2612570822238922, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3585379123687744, "rewards/thermo_reward/std": 1.256617546081543, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09677848406136036, "epoch": 2.204, "frac_reward_zero_std": 0.0, "grad_norm": 0.13801175355911255, "learning_rate": 1.2212706181481266e-06, "loss": -0.0007, "num_tokens": 9601168.0, "reward": 13.477926254272461, "reward_std": 1.2938647270202637, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7869364619255066, "rewards/length2tails_reward/std": 0.24135585129261017, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3389272689819336, "rewards/thermo_reward/std": 1.1878148317337036, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.9375, "completions/mean_terminated_length": 271.9375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09158475417643785, "epoch": 2.206, "frac_reward_zero_std": 0.0, "grad_norm": 0.25704577565193176, "learning_rate": 1.2200199385160039e-06, "loss": -0.0023, "num_tokens": 9609902.0, "reward": 12.840763092041016, "reward_std": 4.182301998138428, "rewards/fitness_reward/mean": 7.025365829467773, "rewards/fitness_reward/std": 1.8996788263320923, "rewards/kidney_reward/mean": 2.440293073654175, "rewards/kidney_reward/std": 0.9067621827125549, "rewards/length2tails_reward/mean": 0.73990797996521, "rewards/length2tails_reward/std": 0.2573379576206207, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.201112747192383, "rewards/thermo_reward/std": 1.6995855569839478, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08908624108880758, "epoch": 2.208, "frac_reward_zero_std": 0.0, "grad_norm": 0.1501232534646988, "learning_rate": 1.2187688971163752e-06, "loss": -0.0052, "num_tokens": 9618653.0, "reward": 13.237232208251953, "reward_std": 3.0937626361846924, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.5390896797180176, "rewards/kidney_reward/std": 0.49435171484947205, "rewards/length2tails_reward/mean": 0.7692506313323975, "rewards/length2tails_reward/std": 0.28168579936027527, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.46816349029541, "rewards/thermo_reward/std": 1.1310467720031738, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08702242281287909, "epoch": 2.21, "frac_reward_zero_std": 0.0, "grad_norm": 0.0907735675573349, "learning_rate": 1.2175174960062635e-06, "loss": -0.0029, "num_tokens": 9627369.0, "reward": 13.019981384277344, "reward_std": 3.489938259124756, "rewards/fitness_reward/mean": 7.005073547363281, "rewards/fitness_reward/std": 2.014472723007202, "rewards/kidney_reward/mean": 2.499762773513794, "rewards/kidney_reward/std": 0.5781379342079163, "rewards/length2tails_reward/mean": 0.6592178344726562, "rewards/length2tails_reward/std": 0.3506201207637787, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3492231369018555, "rewards/thermo_reward/std": 1.0179513692855835, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09034999832510948, "epoch": 2.212, "frac_reward_zero_std": 0.0, "grad_norm": 0.35235831141471863, "learning_rate": 1.2162657372432833e-06, "loss": -0.0027, "num_tokens": 9636098.0, "reward": 13.280094146728516, "reward_std": 2.321930408477783, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4449310302734375, "rewards/kidney_reward/std": 0.8809229731559753, "rewards/length2tails_reward/mean": 0.7409297227859497, "rewards/length2tails_reward/std": 0.29801851511001587, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3573951721191406, "rewards/thermo_reward/std": 1.1669204235076904, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.96875, "completions/mean_terminated_length": 272.96875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09247970208525658, "epoch": 2.214, "frac_reward_zero_std": 0.0, "grad_norm": 0.09246492385864258, "learning_rate": 1.215013622885638e-06, "loss": -0.0044, "num_tokens": 9644865.0, "reward": 13.759563446044922, "reward_std": 0.5116779804229736, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8216274976730347, "rewards/length2tails_reward/std": 0.18907445669174194, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09686029702425003, "epoch": 2.216, "frac_reward_zero_std": 0.0, "grad_norm": 0.10450281202793121, "learning_rate": 1.2137611549921145e-06, "loss": 0.0029, "num_tokens": 9653613.0, "reward": 13.73473072052002, "reward_std": 0.8503785729408264, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7688559293746948, "rewards/length2tails_reward/std": 0.2738942503929138, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.570180654525757, "rewards/thermo_reward/std": 0.8536139726638794, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.4375, "completions/mean_terminated_length": 271.4375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.09789775125682354, "epoch": 2.218, "frac_reward_zero_std": 0.0, "grad_norm": 0.05906645581126213, "learning_rate": 1.2125083356220816e-06, "loss": -0.0029, "num_tokens": 9662331.0, "reward": 13.155956268310547, "reward_std": 3.0850579738616943, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.5390896797180176, "rewards/kidney_reward/std": 0.49435171484947205, "rewards/length2tails_reward/mean": 0.7530629634857178, "rewards/length2tails_reward/std": 0.3211040496826172, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.388507127761841, "rewards/thermo_reward/std": 1.147755742073059, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09061425412073731, "epoch": 2.22, "frac_reward_zero_std": 0.0, "grad_norm": 0.1054484099149704, "learning_rate": 1.2112551668354861e-06, "loss": -0.0054, "num_tokens": 9671069.0, "reward": 13.273934364318848, "reward_std": 2.2469191551208496, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4686717987060547, "rewards/kidney_reward/std": 0.749081552028656, "rewards/length2tails_reward/mean": 0.749596118927002, "rewards/length2tails_reward/std": 0.2696993947029114, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.326627254486084, "rewards/thermo_reward/std": 1.375333547592163, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 270.9375, "completions/mean_terminated_length": 270.9375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08864146191626787, "epoch": 2.222, "frac_reward_zero_std": 0.0, "grad_norm": 0.06858351826667786, "learning_rate": 1.2100016506928491e-06, "loss": -0.0034, "num_tokens": 9679771.0, "reward": 13.456001281738281, "reward_std": 1.3390583992004395, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.6750684380531311, "rewards/length2tails_reward/std": 0.3238985538482666, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.300830364227295, "rewards/thermo_reward/std": 1.3286960124969482, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.0936619434505701, "epoch": 2.224, "frac_reward_zero_std": 0.0, "grad_norm": 0.1211986243724823, "learning_rate": 1.2087477892552633e-06, "loss": -0.0002, "num_tokens": 9688492.0, "reward": 13.617203712463379, "reward_std": 0.6876401305198669, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7429279088973999, "rewards/length2tails_reward/std": 0.27183738350868225, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5099644660949707, "rewards/thermo_reward/std": 0.5615194439888, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.90625, "completions/mean_terminated_length": 270.90625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.10044244676828384, "epoch": 2.226, "frac_reward_zero_std": 0.0, "grad_norm": 0.11018425226211548, "learning_rate": 1.2074935845843885e-06, "loss": 0.0028, "num_tokens": 9697193.0, "reward": 13.161798477172852, "reward_std": 2.0913569927215576, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4495739936828613, "rewards/kidney_reward/std": 0.5821607708930969, "rewards/length2tails_reward/mean": 0.7702677249908447, "rewards/length2tails_reward/std": 0.27362877130508423, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.174013614654541, "rewards/thermo_reward/std": 1.5860369205474854, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 274.125, "completions/mean_terminated_length": 274.125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09556818194687366, "epoch": 2.228, "frac_reward_zero_std": 0.0, "grad_norm": 0.8129578828811646, "learning_rate": 1.206239038742449e-06, "loss": -0.0042, "num_tokens": 9705997.0, "reward": 13.08530044555664, "reward_std": 2.681222915649414, "rewards/fitness_reward/mean": 7.051130294799805, "rewards/fitness_reward/std": 1.753933072090149, "rewards/kidney_reward/mean": 2.456324815750122, "rewards/kidney_reward/std": 0.5482165813446045, "rewards/length2tails_reward/mean": 0.8678853511810303, "rewards/length2tails_reward/std": 0.2075439840555191, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3910574913024902, "rewards/thermo_reward/std": 1.0013716220855713, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09249386843293905, "epoch": 2.23, "frac_reward_zero_std": 0.0, "grad_norm": 0.10124978423118591, "learning_rate": 1.2049841537922305e-06, "loss": -0.0076, "num_tokens": 9714733.0, "reward": 12.888282775878906, "reward_std": 2.722522735595703, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.485222816467285, "rewards/kidney_reward/std": 0.6577568650245667, "rewards/length2tails_reward/mean": 0.741077184677124, "rewards/length2tails_reward/std": 0.33156618475914, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.925276517868042, "rewards/thermo_reward/std": 2.0026774406433105, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.5625, "completions/mean_terminated_length": 271.5625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09432558994740248, "epoch": 2.232, "frac_reward_zero_std": 0.0, "grad_norm": 0.1653272807598114, "learning_rate": 1.2037289317970756e-06, "loss": 0.0041, "num_tokens": 9723455.0, "reward": 11.966596603393555, "reward_std": 6.564814567565918, "rewards/fitness_reward/mean": 6.64857292175293, "rewards/fitness_reward/std": 2.827414035797119, "rewards/kidney_reward/mean": 2.1993813514709473, "rewards/kidney_reward/std": 1.6382735967636108, "rewards/length2tails_reward/mean": 0.7423949241638184, "rewards/length2tails_reward/std": 0.2909180223941803, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9444031715393066, "rewards/thermo_reward/std": 2.245339870452881, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.09375, "completions/mean_terminated_length": 273.09375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09407586511224508, "epoch": 2.234, "frac_reward_zero_std": 0.0, "grad_norm": 0.21728630363941193, "learning_rate": 1.2024733748208818e-06, "loss": -0.002, "num_tokens": 9732226.0, "reward": 13.806081771850586, "reward_std": 0.5215768218040466, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7626720070838928, "rewards/length2tails_reward/std": 0.3082931637763977, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 274.03125, "completions/mean_terminated_length": 274.03125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10200390871614218, "epoch": 2.2359999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.12265169620513916, "learning_rate": 1.201217484928097e-06, "loss": -0.0023, "num_tokens": 9741027.0, "reward": 13.404191970825195, "reward_std": 2.496299982070923, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.453716278076172, "rewards/kidney_reward/std": 0.9772971272468567, "rewards/length2tails_reward/mean": 0.8660367727279663, "rewards/length2tails_reward/std": 0.19639356434345245, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.402686834335327, "rewards/thermo_reward/std": 1.5408576726913452, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.15625, "completions/mean_terminated_length": 272.15625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09110501315444708, "epoch": 2.238, "frac_reward_zero_std": 0.0, "grad_norm": 2.0856006145477295, "learning_rate": 1.1999612641837166e-06, "loss": -0.0, "num_tokens": 9749768.0, "reward": 13.267995834350586, "reward_std": 1.4361052513122559, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7791933417320251, "rewards/length2tails_reward/std": 0.24449069797992706, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1297717094421387, "rewards/thermo_reward/std": 1.3451257944107056, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08734600804746151, "epoch": 2.24, "frac_reward_zero_std": 0.0, "grad_norm": 0.08932296186685562, "learning_rate": 1.1987047146532799e-06, "loss": -0.0069, "num_tokens": 9758505.0, "reward": 13.060445785522461, "reward_std": 2.2628417015075684, "rewards/fitness_reward/mean": 7.188657760620117, "rewards/fitness_reward/std": 0.7179933190345764, "rewards/kidney_reward/mean": 2.508427143096924, "rewards/kidney_reward/std": 0.5310735702514648, "rewards/length2tails_reward/mean": 0.722597599029541, "rewards/length2tails_reward/std": 0.30088114738464355, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1911020278930664, "rewards/thermo_reward/std": 1.3022575378417969, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.34375, "completions/mean_terminated_length": 273.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10040435381233692, "epoch": 2.242, "frac_reward_zero_std": 0.0, "grad_norm": 0.15858429670333862, "learning_rate": 1.197447838402867e-06, "loss": 0.0067, "num_tokens": 9767284.0, "reward": 13.367097854614258, "reward_std": 1.0577739477157593, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7829563021659851, "rewards/length2tails_reward/std": 0.3015859127044678, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2558560371398926, "rewards/thermo_reward/std": 0.9143465757369995, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.09375, "completions/mean_terminated_length": 273.09375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09452992957085371, "epoch": 2.2439999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.07336730509996414, "learning_rate": 1.196190637499095e-06, "loss": -0.0041, "num_tokens": 9776055.0, "reward": 13.433874130249023, "reward_std": 1.4296433925628662, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4767003059387207, "rewards/kidney_reward/std": 0.5699067711830139, "rewards/length2tails_reward/mean": 0.8327226638793945, "rewards/length2tails_reward/std": 0.23930272459983826, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.412717342376709, "rewards/thermo_reward/std": 0.9000661373138428, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.40625, "completions/mean_terminated_length": 273.40625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09509277995675802, "epoch": 2.246, "frac_reward_zero_std": 0.0, "grad_norm": 0.1498255878686905, "learning_rate": 1.1949331140091152e-06, "loss": -0.0019, "num_tokens": 9784836.0, "reward": 12.87520980834961, "reward_std": 2.7726833820343018, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.452592670917511, "rewards/kidney_reward/mean": 2.4142074584960938, "rewards/kidney_reward/std": 0.7665389180183411, "rewards/length2tails_reward/mean": 0.8063620328903198, "rewards/length2tails_reward/std": 0.2866155505180359, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0341992378234863, "rewards/thermo_reward/std": 1.8778098821640015, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09982185065746307, "epoch": 2.248, "frac_reward_zero_std": 0.0, "grad_norm": 0.21547384560108185, "learning_rate": 1.1936752700006086e-06, "loss": 0.0022, "num_tokens": 9793592.0, "reward": 13.266108512878418, "reward_std": 1.8420783281326294, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4757165908813477, "rewards/kidney_reward/std": 0.5750632286071777, "rewards/length2tails_reward/mean": 0.7889871001243591, "rewards/length2tails_reward/std": 0.2755734622478485, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3078179359436035, "rewards/thermo_reward/std": 1.0306305885314941, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.03125, "completions/mean_terminated_length": 273.03125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.09938167594373226, "epoch": 2.25, "frac_reward_zero_std": 0.0, "grad_norm": 0.10929016023874283, "learning_rate": 1.1924171075417836e-06, "loss": 0.0005, "num_tokens": 9802361.0, "reward": 13.469961166381836, "reward_std": 1.7545742988586426, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4885220527648926, "rewards/kidney_reward/std": 0.6396322250366211, "rewards/length2tails_reward/mean": 0.8194814324378967, "rewards/length2tails_reward/std": 0.20707230269908905, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.438304901123047, "rewards/thermo_reward/std": 1.1463873386383057, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09620609972625971, "epoch": 2.252, "frac_reward_zero_std": 0.0, "grad_norm": 0.10473023355007172, "learning_rate": 1.1911586287013725e-06, "loss": -0.002, "num_tokens": 9811113.0, "reward": 12.45807933807373, "reward_std": 4.429269313812256, "rewards/fitness_reward/mean": 7.04914665222168, "rewards/fitness_reward/std": 1.7651554346084595, "rewards/kidney_reward/mean": 2.2511706352233887, "rewards/kidney_reward/std": 1.2130392789840698, "rewards/length2tails_reward/mean": 0.7860097885131836, "rewards/length2tails_reward/std": 0.25211507081985474, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.979160785675049, "rewards/thermo_reward/std": 2.074921131134033, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.28125, "completions/mean_terminated_length": 273.28125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09645435586571693, "epoch": 2.254, "frac_reward_zero_std": 0.0, "grad_norm": 0.09141551703214645, "learning_rate": 1.1898998355486272e-06, "loss": -0.0055, "num_tokens": 9819890.0, "reward": 13.026348114013672, "reward_std": 4.3629889488220215, "rewards/fitness_reward/mean": 7.019981384277344, "rewards/fitness_reward/std": 1.930139183998108, "rewards/kidney_reward/mean": 2.436849355697632, "rewards/kidney_reward/std": 1.0727108716964722, "rewards/length2tails_reward/mean": 0.7998782396316528, "rewards/length2tails_reward/std": 0.2706691324710846, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.389529228210449, "rewards/thermo_reward/std": 1.4026683568954468, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.96875, "completions/mean_terminated_length": 272.96875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09469268657267094, "epoch": 2.2560000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.08354964852333069, "learning_rate": 1.188640730153317e-06, "loss": 0.0018, "num_tokens": 9828657.0, "reward": 13.520135879516602, "reward_std": 1.2767798900604248, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8036321401596069, "rewards/length2tails_reward/std": 0.24694252014160156, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4068260192871094, "rewards/thermo_reward/std": 1.1119424104690552, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.1023802924901247, "epoch": 2.258, "frac_reward_zero_std": 0.0, "grad_norm": 0.13477559387683868, "learning_rate": 1.1873813145857248e-06, "loss": 0.0004, "num_tokens": 9837401.0, "reward": 13.01758861541748, "reward_std": 2.709385633468628, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.45963191986084, "rewards/kidney_reward/std": 0.6603246331214905, "rewards/length2tails_reward/mean": 0.7318527698516846, "rewards/length2tails_reward/std": 0.2912712097167969, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.081094741821289, "rewards/thermo_reward/std": 1.8870495557785034, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.46875, "completions/mean_terminated_length": 273.46875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09059114195406437, "epoch": 2.26, "frac_reward_zero_std": 0.0, "grad_norm": 0.13202516734600067, "learning_rate": 1.186121590916642e-06, "loss": -0.005, "num_tokens": 9846184.0, "reward": 13.627695083618164, "reward_std": 1.1517388820648193, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5113868713378906, "rewards/kidney_reward/std": 0.5150805115699768, "rewards/length2tails_reward/mean": 0.8301060199737549, "rewards/length2tails_reward/std": 0.23349504172801971, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08616469334810972, "epoch": 2.262, "frac_reward_zero_std": 0.0, "grad_norm": 0.07374259829521179, "learning_rate": 1.1848615612173686e-06, "loss": -0.0039, "num_tokens": 9854915.0, "reward": 13.142324447631836, "reward_std": 3.076014280319214, "rewards/fitness_reward/mean": 7.0498809814453125, "rewards/fitness_reward/std": 1.761002540588379, "rewards/kidney_reward/mean": 2.5088601112365723, "rewards/kidney_reward/std": 0.5287303328514099, "rewards/length2tails_reward/mean": 0.730111837387085, "rewards/length2tails_reward/std": 0.300480455160141, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.410572052001953, "rewards/thermo_reward/std": 0.9098656177520752, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0960354171693325, "epoch": 2.2640000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.21694089472293854, "learning_rate": 1.1836012275597065e-06, "loss": 0.0005, "num_tokens": 9863643.0, "reward": 12.541905403137207, "reward_std": 4.093541622161865, "rewards/fitness_reward/mean": 7.038051605224609, "rewards/fitness_reward/std": 1.8279199600219727, "rewards/kidney_reward/mean": 2.418659210205078, "rewards/kidney_reward/std": 0.8826969265937805, "rewards/length2tails_reward/mean": 0.7205191850662231, "rewards/length2tails_reward/std": 0.29001757502555847, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9131431579589844, "rewards/thermo_reward/std": 1.8318647146224976, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09206268656998873, "epoch": 2.266, "frac_reward_zero_std": 0.0, "grad_norm": 0.25878921151161194, "learning_rate": 1.1823405920159574e-06, "loss": -0.0009, "num_tokens": 9872388.0, "reward": 13.484138488769531, "reward_std": 1.8616029024124146, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5085091590881348, "rewards/kidney_reward/std": 0.5306293368339539, "rewards/length2tails_reward/mean": 0.7648021578788757, "rewards/length2tails_reward/std": 0.28547853231430054, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4379642009735107, "rewards/thermo_reward/std": 1.3502850532531738, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09104644972831011, "epoch": 2.268, "frac_reward_zero_std": 0.0, "grad_norm": 0.12931600213050842, "learning_rate": 1.1810796566589206e-06, "loss": -0.0011, "num_tokens": 9881120.0, "reward": 13.61038589477539, "reward_std": 1.942013144493103, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5073652267456055, "rewards/kidney_reward/std": 0.673812985420227, "rewards/length2tails_reward/mean": 0.7605490684509277, "rewards/length2tails_reward/std": 0.29428207874298096, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5657811164855957, "rewards/thermo_reward/std": 1.2764577865600586, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 268.5, "completions/mean_terminated_length": 268.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.0982245123013854, "epoch": 2.27, "frac_reward_zero_std": 0.0, "grad_norm": 0.6288898587226868, "learning_rate": 1.1798184235618866e-06, "loss": -0.0922, "num_tokens": 9889744.0, "reward": 12.440940856933594, "reward_std": 5.082667827606201, "rewards/fitness_reward/mean": 6.977675437927246, "rewards/fitness_reward/std": 2.1694583892822266, "rewards/kidney_reward/mean": 2.3234381675720215, "rewards/kidney_reward/std": 1.2618170976638794, "rewards/length2tails_reward/mean": 0.8122677803039551, "rewards/length2tails_reward/std": 0.28319400548934937, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9586005210876465, "rewards/thermo_reward/std": 2.035763740539551, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 266.0, "completions/mean_terminated_length": 266.0, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.09654929209500551, "epoch": 2.2720000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.8640089631080627, "learning_rate": 1.1785568947986366e-06, "loss": -0.1303, "num_tokens": 9898288.0, "reward": 12.790825843811035, "reward_std": 4.592556953430176, "rewards/fitness_reward/mean": 6.991696357727051, "rewards/fitness_reward/std": 2.0901432037353516, "rewards/kidney_reward/mean": 2.3974647521972656, "rewards/kidney_reward/std": 1.146229863166809, "rewards/length2tails_reward/mean": 0.7754591703414917, "rewards/length2tails_reward/std": 0.2671588361263275, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.224119186401367, "rewards/thermo_reward/std": 1.446903944015503, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.9375, "completions/mean_terminated_length": 272.9375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09294535405933857, "epoch": 2.274, "frac_reward_zero_std": 0.0, "grad_norm": 0.12001582980155945, "learning_rate": 1.177295072443438e-06, "loss": -0.005, "num_tokens": 9907054.0, "reward": 13.032320022583008, "reward_std": 3.873511791229248, "rewards/fitness_reward/mean": 7.047097682952881, "rewards/fitness_reward/std": 1.7767457962036133, "rewards/kidney_reward/mean": 2.4707627296447754, "rewards/kidney_reward/std": 0.8808674216270447, "rewards/length2tails_reward/mean": 0.8012950420379639, "rewards/length2tails_reward/std": 0.25205302238464355, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.334329843521118, "rewards/thermo_reward/std": 1.284340739250183, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 274.25, "completions/mean_terminated_length": 274.25, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.09910607803612947, "epoch": 2.276, "frac_reward_zero_std": 0.0, "grad_norm": 0.1186261773109436, "learning_rate": 1.1760329585710397e-06, "loss": -0.0014, "num_tokens": 9915862.0, "reward": 12.83854866027832, "reward_std": 4.40946626663208, "rewards/fitness_reward/mean": 7.0206618309021, "rewards/fitness_reward/std": 1.9262902736663818, "rewards/kidney_reward/mean": 2.446737766265869, "rewards/kidney_reward/std": 1.0167741775512695, "rewards/length2tails_reward/mean": 0.8771859407424927, "rewards/length2tails_reward/std": 0.15988872945308685, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1834309101104736, "rewards/thermo_reward/std": 1.6658512353897095, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09410636872053146, "epoch": 2.278, "frac_reward_zero_std": 0.0, "grad_norm": 0.08248171210289001, "learning_rate": 1.1747705552566717e-06, "loss": -0.0034, "num_tokens": 9924593.0, "reward": 13.137075424194336, "reward_std": 2.7558913230895996, "rewards/fitness_reward/mean": 6.993189334869385, "rewards/fitness_reward/std": 2.081698179244995, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7481805086135864, "rewards/length2tails_reward/std": 0.2986763119697571, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4246654510498047, "rewards/thermo_reward/std": 1.154456615447998, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08954431302845478, "epoch": 2.2800000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.098535917699337, "learning_rate": 1.173507864576039e-06, "loss": 0.0059, "num_tokens": 9933321.0, "reward": 13.397356033325195, "reward_std": 1.8085975646972656, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5208089351654053, "rewards/kidney_reward/std": 0.5977639555931091, "rewards/length2tails_reward/mean": 0.7480281591415405, "rewards/length2tails_reward/std": 0.2698073089122772, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.398068428039551, "rewards/thermo_reward/std": 0.9680426120758057, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 272.9375, "completions/mean_terminated_length": 272.9375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10984822269529104, "epoch": 2.282, "frac_reward_zero_std": 0.0, "grad_norm": 0.08918312191963196, "learning_rate": 1.172244888605319e-06, "loss": -0.0014, "num_tokens": 9942087.0, "reward": 13.5318603515625, "reward_std": 1.0554815530776978, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8391514420509338, "rewards/length2tails_reward/std": 0.19658353924751282, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.387639045715332, "rewards/thermo_reward/std": 1.0177863836288452, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.5625, "completions/mean_terminated_length": 273.5625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09403904061764479, "epoch": 2.284, "frac_reward_zero_std": 0.0, "grad_norm": 0.08600325882434845, "learning_rate": 1.1709816294211582e-06, "loss": -0.0041, "num_tokens": 9950873.0, "reward": 13.683900833129883, "reward_std": 1.0784289836883545, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8729538917541504, "rewards/length2tails_reward/std": 0.17817066609859467, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.563659429550171, "rewards/thermo_reward/std": 0.8868530988693237, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09034307207912207, "epoch": 2.286, "frac_reward_zero_std": 0.0, "grad_norm": 0.09123056381940842, "learning_rate": 1.1697180891006689e-06, "loss": -0.002, "num_tokens": 9959602.0, "reward": 13.693422317504883, "reward_std": 0.8599316477775574, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7169778347015381, "rewards/length2tails_reward/std": 0.33215200901031494, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5340588092803955, "rewards/thermo_reward/std": 0.8522934317588806, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09334644302725792, "epoch": 2.288, "frac_reward_zero_std": 0.0, "grad_norm": 0.09386125952005386, "learning_rate": 1.168454269721426e-06, "loss": -0.0031, "num_tokens": 9968323.0, "reward": 13.636048316955566, "reward_std": 1.0688204765319824, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7532642483711243, "rewards/length2tails_reward/std": 0.25923576951026917, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5277750492095947, "rewards/thermo_reward/std": 0.883216381072998, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.53125, "completions/mean_terminated_length": 273.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10313539113849401, "epoch": 2.29, "frac_reward_zero_std": 0.0, "grad_norm": 0.08303596824407578, "learning_rate": 1.1671901733614627e-06, "loss": -0.0061, "num_tokens": 9977108.0, "reward": 13.582897186279297, "reward_std": 1.599317193031311, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.51045298576355, "rewards/kidney_reward/std": 0.5201210975646973, "rewards/length2tails_reward/mean": 0.8479992151260376, "rewards/length2tails_reward/std": 0.20036792755126953, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.526458740234375, "rewards/thermo_reward/std": 1.0810075998306274, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09446029318496585, "epoch": 2.292, "frac_reward_zero_std": 0.0, "grad_norm": 0.12472908198833466, "learning_rate": 1.165925802099268e-06, "loss": -0.0012, "num_tokens": 9985848.0, "reward": 12.099523544311523, "reward_std": 5.365253448486328, "rewards/fitness_reward/mean": 6.966158390045166, "rewards/fitness_reward/std": 2.234607696533203, "rewards/kidney_reward/mean": 2.1472251415252686, "rewards/kidney_reward/std": 1.4632967710494995, "rewards/length2tails_reward/mean": 0.7171519994735718, "rewards/length2tails_reward/std": 0.31994375586509705, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.814424514770508, "rewards/thermo_reward/std": 2.0936083793640137, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.65625, "completions/mean_terminated_length": 270.65625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.09268662519752979, "epoch": 2.294, "frac_reward_zero_std": 0.0, "grad_norm": 0.09646332263946533, "learning_rate": 1.1646611580137823e-06, "loss": -0.0064, "num_tokens": 9994541.0, "reward": 13.49638557434082, "reward_std": 1.167270302772522, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.511590003967285, "rewards/kidney_reward/std": 0.5139835476875305, "rewards/length2tails_reward/mean": 0.7115417718887329, "rewards/length2tails_reward/std": 0.32849302887916565, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.50996470451355, "rewards/thermo_reward/std": 0.5615194439888, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.15625, "completions/mean_terminated_length": 273.15625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09854295663535595, "epoch": 2.296, "frac_reward_zero_std": 0.0, "grad_norm": 0.10047957301139832, "learning_rate": 1.1633962431843955e-06, "loss": -0.0029, "num_tokens": 10003314.0, "reward": 13.771848678588867, "reward_std": 0.5571531653404236, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8192107677459717, "rewards/length2tails_reward/std": 0.23006263375282288, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6296226978302, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08827311685308814, "epoch": 2.298, "frac_reward_zero_std": 0.0, "grad_norm": 0.1572035402059555, "learning_rate": 1.1621310596909421e-06, "loss": 0.0043, "num_tokens": 10012084.0, "reward": 13.377315521240234, "reward_std": 2.1593408584594727, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.501345157623291, "rewards/kidney_reward/std": 0.7078666090965271, "rewards/length2tails_reward/mean": 0.7940536737442017, "rewards/length2tails_reward/std": 0.29090404510498047, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3928890228271484, "rewards/thermo_reward/std": 1.1827491521835327, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.65625, "completions/mean_terminated_length": 271.65625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08063013199716806, "epoch": 2.3, "frac_reward_zero_std": 0.0, "grad_norm": 0.11249648779630661, "learning_rate": 1.1608656096136983e-06, "loss": 0.0014, "num_tokens": 10020809.0, "reward": 12.985828399658203, "reward_std": 4.395967483520508, "rewards/fitness_reward/mean": 7.006161212921143, "rewards/fitness_reward/std": 2.008317708969116, "rewards/kidney_reward/mean": 2.414630889892578, "rewards/kidney_reward/std": 1.0500935316085815, "rewards/length2tails_reward/mean": 0.7493383884429932, "rewards/length2tails_reward/std": 0.27332738041877747, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3901023864746094, "rewards/thermo_reward/std": 1.3996142148971558, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.96875, "completions/mean_terminated_length": 272.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09177826624363661, "epoch": 2.302, "frac_reward_zero_std": 0.0, "grad_norm": 0.19721175730228424, "learning_rate": 1.1595998950333793e-06, "loss": 0.0036, "num_tokens": 10029576.0, "reward": 13.098941802978516, "reward_std": 3.715373992919922, "rewards/fitness_reward/mean": 7.051706314086914, "rewards/fitness_reward/std": 1.7506755590438843, "rewards/kidney_reward/mean": 2.480517625808716, "rewards/kidney_reward/std": 0.8256861567497253, "rewards/length2tails_reward/mean": 0.8044684529304504, "rewards/length2tails_reward/std": 0.25673922896385193, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3862717151641846, "rewards/thermo_reward/std": 1.2167147397994995, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.90625, "completions/mean_terminated_length": 273.90625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08292252477258444, "epoch": 2.304, "frac_reward_zero_std": 0.0, "grad_norm": 0.14803364872932434, "learning_rate": 1.158333918031134e-06, "loss": -0.0003, "num_tokens": 10038373.0, "reward": 12.912191390991211, "reward_std": 4.401137351989746, "rewards/fitness_reward/mean": 7.018555164337158, "rewards/fitness_reward/std": 1.938206672668457, "rewards/kidney_reward/mean": 2.420194625854492, "rewards/kidney_reward/std": 1.0189740657806396, "rewards/length2tails_reward/mean": 0.8237855434417725, "rewards/length2tails_reward/std": 0.27030956745147705, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.291062593460083, "rewards/thermo_reward/std": 1.5103964805603027, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.6875, "completions/mean_terminated_length": 273.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08876392152160406, "epoch": 2.306, "frac_reward_zero_std": 0.0, "grad_norm": 0.10971035063266754, "learning_rate": 1.157067680688544e-06, "loss": 0.003, "num_tokens": 10047163.0, "reward": 13.720508575439453, "reward_std": 0.5333245396614075, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.829923152923584, "rewards/length2tails_reward/std": 0.23732823133468628, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498507022857666, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.34375, "completions/mean_terminated_length": 271.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09525459352880716, "epoch": 2.308, "frac_reward_zero_std": 0.0, "grad_norm": 0.0928049087524414, "learning_rate": 1.1558011850876181e-06, "loss": -0.0003, "num_tokens": 10055878.0, "reward": 13.224510192871094, "reward_std": 1.984014630317688, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5322229862213135, "rewards/kidney_reward/std": 0.5331960916519165, "rewards/length2tails_reward/mean": 0.7088586688041687, "rewards/length2tails_reward/std": 0.3383852541446686, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.160215377807617, "rewards/thermo_reward/std": 1.6182459592819214, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.34375, "completions/mean_terminated_length": 270.34375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.09121548756957054, "epoch": 2.31, "frac_reward_zero_std": 0.0, "grad_norm": 0.08748360723257065, "learning_rate": 1.1545344333107904e-06, "loss": -0.0005, "num_tokens": 10064561.0, "reward": 13.696756362915039, "reward_std": 0.868507444858551, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7743874788284302, "rewards/length2tails_reward/std": 0.262215256690979, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5316529273986816, "rewards/thermo_reward/std": 0.8640903234481812, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 273.90625, "completions/mean_terminated_length": 273.90625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.092183624394238, "epoch": 2.312, "frac_reward_zero_std": 0.0, "grad_norm": 0.14290811121463776, "learning_rate": 1.1532674274409157e-06, "loss": 0.0001, "num_tokens": 10073358.0, "reward": 13.178348541259766, "reward_std": 1.9608882665634155, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4581027030944824, "rewards/kidney_reward/std": 0.6685106158256531, "rewards/length2tails_reward/mean": 0.8021647930145264, "rewards/length2tails_reward/std": 0.2939205467700958, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1788454055786133, "rewards/thermo_reward/std": 1.3569666147232056, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.78125, "completions/mean_terminated_length": 273.78125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09328475221991539, "epoch": 2.314, "frac_reward_zero_std": 0.0, "grad_norm": 0.07290919125080109, "learning_rate": 1.1520001695612673e-06, "loss": -0.0031, "num_tokens": 10082151.0, "reward": 13.920624732971191, "reward_std": 0.3185406029224396, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8367986679077148, "rewards/length2tails_reward/std": 0.23183605074882507, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09928023163229227, "epoch": 2.316, "frac_reward_zero_std": 0.0, "grad_norm": 0.14347343146800995, "learning_rate": 1.1507326617555312e-06, "loss": 0.0055, "num_tokens": 10090872.0, "reward": 13.569886207580566, "reward_std": 1.0947325229644775, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7327724695205688, "rewards/length2tails_reward/std": 0.2740943431854248, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4363033771514893, "rewards/thermo_reward/std": 0.9662889838218689, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.96875, "completions/mean_terminated_length": 272.96875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08951482083648443, "epoch": 2.318, "frac_reward_zero_std": 0.0, "grad_norm": 0.11145684123039246, "learning_rate": 1.1494649061078069e-06, "loss": 0.0015, "num_tokens": 10099639.0, "reward": 13.879165649414062, "reward_std": 0.37595221400260925, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8210693597793579, "rewards/length2tails_reward/std": 0.2428348809480667, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09583273157477379, "epoch": 2.32, "frac_reward_zero_std": 0.0, "grad_norm": 0.11117927730083466, "learning_rate": 1.1481969047025993e-06, "loss": -0.0009, "num_tokens": 10108389.0, "reward": 13.457784652709961, "reward_std": 1.157409429550171, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.787238359451294, "rewards/length2tails_reward/std": 0.26374247670173645, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3187546730041504, "rewards/thermo_reward/std": 1.122248888015747, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.65625, "completions/mean_terminated_length": 271.65625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.08784770919010043, "epoch": 2.322, "frac_reward_zero_std": 0.0, "grad_norm": 0.14624707400798798, "learning_rate": 1.146928659624818e-06, "loss": -0.0034, "num_tokens": 10117114.0, "reward": 13.683073997497559, "reward_std": 0.8640279173851013, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7801396250724792, "rewards/length2tails_reward/std": 0.26757025718688965, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09903733339160681, "epoch": 2.324, "frac_reward_zero_std": 0.0, "grad_norm": 0.0889928787946701, "learning_rate": 1.1456601729597735e-06, "loss": -0.0051, "num_tokens": 10125859.0, "reward": 13.514886856079102, "reward_std": 1.2132494449615479, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.768134593963623, "rewards/length2tails_reward/std": 0.2764660120010376, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.377768039703369, "rewards/thermo_reward/std": 1.0657364130020142, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.3125, "completions/mean_terminated_length": 273.3125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0959858438000083, "epoch": 2.326, "frac_reward_zero_std": 0.0, "grad_norm": 0.13234223425388336, "learning_rate": 1.1443914467931734e-06, "loss": 0.0014, "num_tokens": 10134637.0, "reward": 13.181429862976074, "reward_std": 2.5910279750823975, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.4817724227905273, "rewards/kidney_reward/std": 0.8185869455337524, "rewards/length2tails_reward/mean": 0.823712944984436, "rewards/length2tails_reward/std": 0.24324999749660492, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2711193561553955, "rewards/thermo_reward/std": 1.2079739570617676, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.8125, "completions/mean_terminated_length": 271.8125, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "entropy": 0.0841765240766108, "epoch": 2.328, "frac_reward_zero_std": 0.0, "grad_norm": 0.20495650172233582, "learning_rate": 1.1431224832111194e-06, "loss": 0.0053, "num_tokens": 10143367.0, "reward": 13.428060531616211, "reward_std": 2.060119867324829, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5146913528442383, "rewards/kidney_reward/std": 0.6323707103729248, "rewards/length2tails_reward/mean": 0.7269923686981201, "rewards/length2tails_reward/std": 0.30764666199684143, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4369945526123047, "rewards/thermo_reward/std": 1.153154730796814, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 270.96875, "completions/mean_terminated_length": 270.96875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.08938687853515148, "epoch": 2.33, "frac_reward_zero_std": 0.0, "grad_norm": 0.16077359020709991, "learning_rate": 1.141853284300103e-06, "loss": 0.0022, "num_tokens": 10152070.0, "reward": 13.645215034484863, "reward_std": 0.6276077628135681, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7494490742683411, "rewards/length2tails_reward/std": 0.3037169277667999, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5099644660949707, "rewards/thermo_reward/std": 0.5615194439888, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.1875, "completions/mean_terminated_length": 272.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08419801201671362, "epoch": 2.332, "frac_reward_zero_std": 0.0, "grad_norm": 0.3486080467700958, "learning_rate": 1.1405838521470028e-06, "loss": 0.0009, "num_tokens": 10160812.0, "reward": 13.514474868774414, "reward_std": 1.1845166683197021, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7699288725852966, "rewards/length2tails_reward/std": 0.27235648036003113, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4045357704162598, "rewards/thermo_reward/std": 1.1235049962997437, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09492563083767891, "epoch": 2.334, "frac_reward_zero_std": 0.0, "grad_norm": 0.08199910819530487, "learning_rate": 1.1393141888390813e-06, "loss": -0.0037, "num_tokens": 10169570.0, "reward": 13.191213607788086, "reward_std": 2.442258834838867, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.3957366943359375, "rewards/kidney_reward/std": 1.0089517831802368, "rewards/length2tails_reward/mean": 0.7714502811431885, "rewards/length2tails_reward/std": 0.2730657458305359, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2571463584899902, "rewards/thermo_reward/std": 1.5108851194381714, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.96875, "completions/mean_terminated_length": 272.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08890333492308855, "epoch": 2.336, "frac_reward_zero_std": 0.0, "grad_norm": 0.12107077986001968, "learning_rate": 1.1380442964639804e-06, "loss": 0.0003, "num_tokens": 10178337.0, "reward": 12.145477294921875, "reward_std": 4.592377185821533, "rewards/fitness_reward/mean": 7.028352737426758, "rewards/fitness_reward/std": 1.8827823400497437, "rewards/kidney_reward/mean": 2.28318452835083, "rewards/kidney_reward/std": 1.0548654794692993, "rewards/length2tails_reward/mean": 0.7793346643447876, "rewards/length2tails_reward/std": 0.3058479130268097, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.6560065746307373, "rewards/thermo_reward/std": 2.263650417327881, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.10207834374159575, "epoch": 2.338, "frac_reward_zero_std": 0.0, "grad_norm": 0.12703408300876617, "learning_rate": 1.1367741771097197e-06, "loss": 0.0024, "num_tokens": 10187107.0, "reward": 13.797447204589844, "reward_std": 0.4676748812198639, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8015918135643005, "rewards/length2tails_reward/std": 0.20477145910263062, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.09466278459876776, "epoch": 2.34, "frac_reward_zero_std": 0.0, "grad_norm": 0.0807986930012703, "learning_rate": 1.135503832864691e-06, "loss": -0.0059, "num_tokens": 10195835.0, "reward": 12.499326705932617, "reward_std": 3.7159574031829834, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.347208023071289, "rewards/kidney_reward/std": 0.8026228547096252, "rewards/length2tails_reward/mean": 0.7313401699066162, "rewards/length2tails_reward/std": 0.3534490168094635, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9259305000305176, "rewards/thermo_reward/std": 1.93488347530365, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09521954506635666, "epoch": 2.342, "frac_reward_zero_std": 0.0, "grad_norm": 0.12933559715747833, "learning_rate": 1.1342332658176555e-06, "loss": -0.0027, "num_tokens": 10204570.0, "reward": 13.178197860717773, "reward_std": 4.378511905670166, "rewards/fitness_reward/mean": 7.008818626403809, "rewards/fitness_reward/std": 1.993286371231079, "rewards/kidney_reward/mean": 2.441524028778076, "rewards/kidney_reward/std": 1.0462663173675537, "rewards/length2tails_reward/mean": 0.7510663270950317, "rewards/length2tails_reward/std": 0.26452168822288513, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.552748441696167, "rewards/thermo_reward/std": 1.3490850925445557, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 273.3125, "completions/mean_terminated_length": 273.3125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08618407044559717, "epoch": 2.344, "frac_reward_zero_std": 0.0, "grad_norm": 0.06609036028385162, "learning_rate": 1.1329624780577425e-06, "loss": 0.0004, "num_tokens": 10213348.0, "reward": 13.34747314453125, "reward_std": 1.7441539764404297, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5383336544036865, "rewards/kidney_reward/std": 0.49862852692604065, "rewards/length2tails_reward/mean": 0.8297273516654968, "rewards/length2tails_reward/std": 0.24132372438907623, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.264981746673584, "rewards/thermo_reward/std": 1.4160000085830688, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.71875, "completions/mean_terminated_length": 271.71875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09455079026520252, "epoch": 2.346, "frac_reward_zero_std": 0.0, "grad_norm": 0.11646193265914917, "learning_rate": 1.1316914716744426e-06, "loss": -0.006, "num_tokens": 10222075.0, "reward": 13.316307067871094, "reward_std": 1.4536793231964111, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7493178844451904, "rewards/length2tails_reward/std": 0.2923212945461273, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1537110805511475, "rewards/thermo_reward/std": 1.4431830644607544, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.40625, "completions/mean_terminated_length": 272.40625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08919279649853706, "epoch": 2.348, "frac_reward_zero_std": 0.0, "grad_norm": 0.1078300029039383, "learning_rate": 1.1304202487576066e-06, "loss": 0.0005, "num_tokens": 10230824.0, "reward": 13.742897033691406, "reward_std": 0.6213005781173706, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8032835125923157, "rewards/length2tails_reward/std": 0.23592759668827057, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.102267200127244, "epoch": 2.35, "frac_reward_zero_std": 0.0, "grad_norm": 0.06344044208526611, "learning_rate": 1.1291488113974415e-06, "loss": -0.0041, "num_tokens": 10239574.0, "reward": 13.540192604064941, "reward_std": 1.4909051656723022, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7741001844406128, "rewards/length2tails_reward/std": 0.24528786540031433, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4024767875671387, "rewards/thermo_reward/std": 1.3338748216629028, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09431599359959364, "epoch": 2.352, "frac_reward_zero_std": 0.0, "grad_norm": 0.09179411083459854, "learning_rate": 1.127877161684506e-06, "loss": -0.0008, "num_tokens": 10248309.0, "reward": 13.469427108764648, "reward_std": 1.0344244241714478, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7367984056472778, "rewards/length2tails_reward/std": 0.27189457416534424, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.335442543029785, "rewards/thermo_reward/std": 0.9056594371795654, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.84375, "completions/mean_terminated_length": 272.84375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08454183116555214, "epoch": 2.354, "frac_reward_zero_std": 0.0, "grad_norm": 0.10728532820940018, "learning_rate": 1.126605301709709e-06, "loss": -0.0042, "num_tokens": 10257072.0, "reward": 13.568931579589844, "reward_std": 1.9710923433303833, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5000758171081543, "rewards/kidney_reward/std": 0.7150477766990662, "rewards/length2tails_reward/mean": 0.7729551792144775, "rewards/length2tails_reward/std": 0.25684475898742676, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.530374526977539, "rewards/thermo_reward/std": 1.26329505443573, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.84375, "completions/mean_terminated_length": 272.84375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09735297691076994, "epoch": 2.356, "frac_reward_zero_std": 0.0, "grad_norm": 0.1453525424003601, "learning_rate": 1.1253332335643042e-06, "loss": 0.0013, "num_tokens": 10265835.0, "reward": 13.430132865905762, "reward_std": 1.3856492042541504, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7720500230789185, "rewards/length2tails_reward/std": 0.25347065925598145, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3473410606384277, "rewards/thermo_reward/std": 1.158356785774231, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09220325388014317, "epoch": 2.358, "frac_reward_zero_std": 0.0, "grad_norm": 0.1444585621356964, "learning_rate": 1.1240609593398884e-06, "loss": -0.0003, "num_tokens": 10274570.0, "reward": 12.636201858520508, "reward_std": 5.048321723937988, "rewards/fitness_reward/mean": 7.012203216552734, "rewards/fitness_reward/std": 1.9741390943527222, "rewards/kidney_reward/mean": 2.2174811363220215, "rewards/kidney_reward/std": 1.506496548652649, "rewards/length2tails_reward/mean": 0.7474431395530701, "rewards/length2tails_reward/std": 0.31676745414733887, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2317728996276855, "rewards/thermo_reward/std": 1.8967760801315308, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.65625, "completions/mean_terminated_length": 272.65625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0881521599367261, "epoch": 2.36, "frac_reward_zero_std": 0.0, "grad_norm": 0.12496333569288254, "learning_rate": 1.122788481128397e-06, "loss": -0.0008, "num_tokens": 10283327.0, "reward": 13.534195899963379, "reward_std": 1.1190105676651, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.784372866153717, "rewards/length2tails_reward/std": 0.25296303629875183, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.368093967437744, "rewards/thermo_reward/std": 1.1134434938430786, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.40625, "completions/mean_terminated_length": 271.40625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.08923059329390526, "epoch": 2.362, "frac_reward_zero_std": 0.0, "grad_norm": 0.061542972922325134, "learning_rate": 1.1215158010221004e-06, "loss": -0.0036, "num_tokens": 10292044.0, "reward": 13.394866943359375, "reward_std": 2.238596200942993, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5037803649902344, "rewards/kidney_reward/std": 0.6940914392471313, "rewards/length2tails_reward/mean": 0.6941548585891724, "rewards/length2tails_reward/std": 0.35992923378944397, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4179959297180176, "rewards/thermo_reward/std": 1.2520838975906372, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 754.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 287.71875, "completions/mean_terminated_length": 272.6773986816406, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.1059657009318471, "epoch": 2.364, "frac_reward_zero_std": 0.0, "grad_norm": 0.5759685635566711, "learning_rate": 1.1202429211136011e-06, "loss": -0.0212, "num_tokens": 10301283.0, "reward": 13.834796905517578, "reward_std": 0.4251575767993927, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7762397527694702, "rewards/length2tails_reward/std": 0.2769671082496643, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09039780683815479, "epoch": 2.366, "frac_reward_zero_std": 0.0, "grad_norm": 0.1149810180068016, "learning_rate": 1.1189698434958308e-06, "loss": 0.0005, "num_tokens": 10310027.0, "reward": 13.576425552368164, "reward_std": 1.3609395027160645, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7675727605819702, "rewards/length2tails_reward/std": 0.2958059012889862, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4393632411956787, "rewards/thermo_reward/std": 1.2457044124603271, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.5625, "completions/mean_terminated_length": 273.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08911374118179083, "epoch": 2.368, "frac_reward_zero_std": 0.0, "grad_norm": 0.10090884566307068, "learning_rate": 1.1176965702620453e-06, "loss": 0.0011, "num_tokens": 10318813.0, "reward": 13.879654884338379, "reward_std": 0.37785404920578003, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8259574174880981, "rewards/length2tails_reward/std": 0.26726025342941284, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.1875, "completions/mean_terminated_length": 272.1875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09442420396953821, "epoch": 2.37, "frac_reward_zero_std": 0.0, "grad_norm": 0.09971757233142853, "learning_rate": 1.1164231035058227e-06, "loss": -0.0011, "num_tokens": 10327555.0, "reward": 13.422538757324219, "reward_std": 1.3540540933609009, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7836484909057617, "rewards/length2tails_reward/std": 0.2161341905593872, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2838685512542725, "rewards/thermo_reward/std": 1.246479868888855, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.9375, "completions/mean_terminated_length": 272.9375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10214246436953545, "epoch": 2.372, "frac_reward_zero_std": 0.0, "grad_norm": 0.1652139127254486, "learning_rate": 1.1151494453210594e-06, "loss": -0.0042, "num_tokens": 10336321.0, "reward": 13.216739654541016, "reward_std": 2.033397912979126, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.510984420776367, "rewards/kidney_reward/std": 0.5172518491744995, "rewards/length2tails_reward/mean": 0.8202112913131714, "rewards/length2tails_reward/std": 0.19941861927509308, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1625497341156006, "rewards/thermo_reward/std": 1.6370229721069336, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10318686440587044, "epoch": 2.374, "frac_reward_zero_std": 0.0, "grad_norm": 0.12082388252019882, "learning_rate": 1.1138755978019656e-06, "loss": 0.0042, "num_tokens": 10345072.0, "reward": 13.59936237335205, "reward_std": 1.057921051979065, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7763901948928833, "rewards/length2tails_reward/std": 0.2672812044620514, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4614181518554688, "rewards/thermo_reward/std": 1.0285004377365112, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.09375, "completions/mean_terminated_length": 272.09375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09636431373655796, "epoch": 2.376, "frac_reward_zero_std": 0.0, "grad_norm": 0.06303004920482635, "learning_rate": 1.1126015630430642e-06, "loss": -0.0074, "num_tokens": 10353811.0, "reward": 13.065311431884766, "reward_std": 3.1626193523406982, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.511730670928955, "rewards/kidney_reward/std": 0.5132253766059875, "rewards/length2tails_reward/mean": 0.7315313220024109, "rewards/length2tails_reward/std": 0.31727486848831177, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.32737398147583, "rewards/thermo_reward/std": 1.237959861755371, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 266.375, "completions/mean_terminated_length": 266.375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.08927316032350063, "epoch": 2.378, "frac_reward_zero_std": 0.0, "grad_norm": 2.4410877227783203, "learning_rate": 1.1113273431391847e-06, "loss": -0.1267, "num_tokens": 10362367.0, "reward": 12.523245811462402, "reward_std": 5.3058390617370605, "rewards/fitness_reward/mean": 6.6293439865112305, "rewards/fitness_reward/std": 2.8821799755096436, "rewards/kidney_reward/mean": 2.3808608055114746, "rewards/kidney_reward/std": 1.239362120628357, "rewards/length2tails_reward/mean": 0.8259913325309753, "rewards/length2tails_reward/std": 0.2122562676668167, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.330442428588867, "rewards/thermo_reward/std": 1.5098158121109009, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09383781487122178, "epoch": 2.38, "frac_reward_zero_std": 0.0, "grad_norm": 0.08897305279970169, "learning_rate": 1.1100529401854616e-06, "loss": -0.0058, "num_tokens": 10371112.0, "reward": 13.609582901000977, "reward_std": 0.6566846966743469, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7919836640357971, "rewards/length2tails_reward/std": 0.25463926792144775, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.470078706741333, "rewards/thermo_reward/std": 0.5830413699150085, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09251491725444794, "epoch": 2.382, "frac_reward_zero_std": 0.0, "grad_norm": 0.1091945618391037, "learning_rate": 1.108778356277331e-06, "loss": 0.0003, "num_tokens": 10379866.0, "reward": 13.770999908447266, "reward_std": 0.55600905418396, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8107157945632935, "rewards/length2tails_reward/std": 0.24261504411697388, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08884621225297451, "epoch": 2.384, "frac_reward_zero_std": 0.0, "grad_norm": 0.08674254268407822, "learning_rate": 1.107503593510525e-06, "loss": -0.0052, "num_tokens": 10388587.0, "reward": 12.903682708740234, "reward_std": 4.312568187713623, "rewards/fitness_reward/mean": 7.004302978515625, "rewards/fitness_reward/std": 2.018831729888916, "rewards/kidney_reward/mean": 2.4614100456237793, "rewards/kidney_reward/std": 0.9337738752365112, "rewards/length2tails_reward/mean": 0.6988670825958252, "rewards/length2tails_reward/std": 0.32447734475135803, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2680823802948, "rewards/thermo_reward/std": 1.4241454601287842, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.08987856283783913, "epoch": 2.386, "frac_reward_zero_std": 0.0, "grad_norm": 0.07334306091070175, "learning_rate": 1.106228653981071e-06, "loss": -0.0067, "num_tokens": 10397318.0, "reward": 13.505498886108398, "reward_std": 0.9637717008590698, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7330926656723022, "rewards/length2tails_reward/std": 0.3236958384513855, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.371884346008301, "rewards/thermo_reward/std": 0.9134882688522339, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.34375, "completions/mean_terminated_length": 273.34375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09814102575182915, "epoch": 2.388, "frac_reward_zero_std": 0.0, "grad_norm": 0.11979229003190994, "learning_rate": 1.1049535397852871e-06, "loss": 0.0004, "num_tokens": 10406097.0, "reward": 13.654312133789062, "reward_std": 0.6249982714653015, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8404203653335571, "rewards/length2tails_reward/std": 0.22256071865558624, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.509964942932129, "rewards/thermo_reward/std": 0.5615193843841553, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.3125, "completions/mean_terminated_length": 272.3125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0880571873858571, "epoch": 2.39, "frac_reward_zero_std": 0.0, "grad_norm": 0.0868200808763504, "learning_rate": 1.1036782530197775e-06, "loss": 0.0036, "num_tokens": 10414843.0, "reward": 13.65894889831543, "reward_std": 0.6650402545928955, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7615159749984741, "rewards/length2tails_reward/std": 0.25980833172798157, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498504638671875, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.625, "completions/mean_terminated_length": 273.625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.09931143745779991, "epoch": 2.392, "frac_reward_zero_std": 0.0, "grad_norm": 0.06371911615133286, "learning_rate": 1.1024027957814312e-06, "loss": -0.0045, "num_tokens": 10423631.0, "reward": 13.089361190795898, "reward_std": 3.0056564807891846, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.511730670928955, "rewards/kidney_reward/std": 0.5132253766059875, "rewards/length2tails_reward/mean": 0.8789088726043701, "rewards/length2tails_reward/std": 0.20397359132766724, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.336686372756958, "rewards/thermo_reward/std": 0.9002619385719299, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.53125, "completions/mean_terminated_length": 272.53125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09386175684630871, "epoch": 2.394, "frac_reward_zero_std": 0.0, "grad_norm": 0.09913461655378342, "learning_rate": 1.1011271701674176e-06, "loss": -0.0059, "num_tokens": 10432384.0, "reward": 13.630131721496582, "reward_std": 0.9156928062438965, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7360557317733765, "rewards/length2tails_reward/std": 0.3140716552734375, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4962210655212402, "rewards/thermo_reward/std": 0.8580461740493774, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09354142751544714, "epoch": 2.396, "frac_reward_zero_std": 0.0, "grad_norm": 0.10588572919368744, "learning_rate": 1.099851378275182e-06, "loss": 0.005, "num_tokens": 10441149.0, "reward": 13.799158096313477, "reward_std": 0.466749906539917, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8187099695205688, "rewards/length2tails_reward/std": 0.1727634072303772, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08997735194861889, "epoch": 2.398, "frac_reward_zero_std": 0.0, "grad_norm": 0.08149711787700653, "learning_rate": 1.0985754222024436e-06, "loss": -0.0021, "num_tokens": 10449857.0, "reward": 13.479462623596191, "reward_std": 1.150247573852539, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.6816555261611938, "rewards/length2tails_reward/std": 0.3275473713874817, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.323631763458252, "rewards/thermo_reward/std": 1.1417499780654907, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.09375, "completions/mean_terminated_length": 271.09375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.09271670412272215, "epoch": 2.4, "frac_reward_zero_std": 0.0, "grad_norm": 0.0913810133934021, "learning_rate": 1.0972993040471917e-06, "loss": -0.0008, "num_tokens": 10458564.0, "reward": 13.827293395996094, "reward_std": 0.4297630786895752, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7012042999267578, "rewards/length2tails_reward/std": 0.305792897939682, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08737404271960258, "epoch": 2.402, "frac_reward_zero_std": 0.0, "grad_norm": 0.09364177286624908, "learning_rate": 1.0960230259076817e-06, "loss": 0.0026, "num_tokens": 10467301.0, "reward": 13.834736824035645, "reward_std": 0.42323192954063416, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7756407260894775, "rewards/length2tails_reward/std": 0.25788721442222595, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.21875, "completions/mean_terminated_length": 271.21875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.09901608899235725, "epoch": 2.404, "frac_reward_zero_std": 0.0, "grad_norm": 0.08422534912824631, "learning_rate": 1.0947465898824328e-06, "loss": -0.0038, "num_tokens": 10476012.0, "reward": 13.394966125488281, "reward_std": 2.5611391067504883, "rewards/fitness_reward/mean": 6.979666709899902, "rewards/fitness_reward/std": 2.158194065093994, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8655651211738586, "rewards/length2tails_reward/std": 0.21018658578395844, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09906727354973555, "epoch": 2.406, "frac_reward_zero_std": 0.0, "grad_norm": 0.06459254771471024, "learning_rate": 1.093469998070223e-06, "loss": -0.0053, "num_tokens": 10484777.0, "reward": 13.304363250732422, "reward_std": 3.006488084793091, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.5390896797180176, "rewards/kidney_reward/std": 0.49435171484947205, "rewards/length2tails_reward/mean": 0.7610428333282471, "rewards/length2tails_reward/std": 0.3280724883079529, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.536116123199463, "rewards/thermo_reward/std": 0.8422486186027527, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09012809442356229, "epoch": 2.408, "frac_reward_zero_std": 0.0, "grad_norm": 0.16831642389297485, "learning_rate": 1.0921932525700868e-06, "loss": 0.0009, "num_tokens": 10493513.0, "reward": 13.215476036071777, "reward_std": 1.5913658142089844, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7565579414367676, "rewards/length2tails_reward/std": 0.25465884804725647, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1068739891052246, "rewards/thermo_reward/std": 1.4153202772140503, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.78125, "completions/mean_terminated_length": 273.78125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08467008266597986, "epoch": 2.41, "frac_reward_zero_std": 0.0, "grad_norm": 0.08127164095640182, "learning_rate": 1.0909163554813116e-06, "loss": 0.001, "num_tokens": 10502306.0, "reward": 13.799966812133789, "reward_std": 0.47498491406440735, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8267866969108582, "rewards/length2tails_reward/std": 0.26131075620651245, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09263253770768642, "epoch": 2.412, "frac_reward_zero_std": 0.0, "grad_norm": 0.09587337076663971, "learning_rate": 1.0896393089034335e-06, "loss": -0.001, "num_tokens": 10511043.0, "reward": 13.44931411743164, "reward_std": 1.5483500957489014, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7766602039337158, "rewards/length2tails_reward/std": 0.2561853229999542, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3113417625427246, "rewards/thermo_reward/std": 1.403664469718933, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08940061461180449, "epoch": 2.414, "frac_reward_zero_std": 0.0, "grad_norm": 0.08128293603658676, "learning_rate": 1.088362114936235e-06, "loss": -0.0026, "num_tokens": 10519778.0, "reward": 13.449129104614258, "reward_std": 1.4584228992462158, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7604248523712158, "rewards/length2tails_reward/std": 0.30569180846214294, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.367499589920044, "rewards/thermo_reward/std": 1.2244524955749512, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09399153850972652, "epoch": 2.416, "frac_reward_zero_std": 0.0, "grad_norm": 0.14469492435455322, "learning_rate": 1.0870847756797404e-06, "loss": -0.0013, "num_tokens": 10528542.0, "reward": 13.58527660369873, "reward_std": 1.3392078876495361, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7924588322639465, "rewards/length2tails_reward/std": 0.2551293969154358, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4730844497680664, "rewards/thermo_reward/std": 1.1635582447052002, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08919752668589354, "epoch": 2.418, "frac_reward_zero_std": 0.0, "grad_norm": 0.08864497393369675, "learning_rate": 1.0858072932342132e-06, "loss": -0.0005, "num_tokens": 10537296.0, "reward": 12.8340425491333, "reward_std": 4.473905563354492, "rewards/fitness_reward/mean": 7.023777484893799, "rewards/fitness_reward/std": 1.9086664915084839, "rewards/kidney_reward/mean": 2.4164366722106934, "rewards/kidney_reward/std": 1.0399911403656006, "rewards/length2tails_reward/mean": 0.7797451019287109, "rewards/length2tails_reward/std": 0.2637164294719696, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2158541679382324, "rewards/thermo_reward/std": 1.8010083436965942, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 275.09375, "completions/mean_terminated_length": 275.09375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08536658063530922, "epoch": 2.42, "frac_reward_zero_std": 0.0, "grad_norm": 0.10550517588853836, "learning_rate": 1.0845296697001527e-06, "loss": 0.0029, "num_tokens": 10546131.0, "reward": 13.507648468017578, "reward_std": 1.3550305366516113, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8864538073539734, "rewards/length2tails_reward/std": 0.2049570232629776, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.386056900024414, "rewards/thermo_reward/std": 1.1557670831680298, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.71875, "completions/mean_terminated_length": 272.71875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0972055597230792, "epoch": 2.422, "frac_reward_zero_std": 0.0, "grad_norm": 0.09713931381702423, "learning_rate": 1.0832519071782892e-06, "loss": 0.0013, "num_tokens": 10554890.0, "reward": 13.635648727416992, "reward_std": 0.583570122718811, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7790554761886597, "rewards/length2tails_reward/std": 0.3008067309856415, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.470078706741333, "rewards/thermo_reward/std": 0.5830413699150085, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.1875, "completions/mean_terminated_length": 272.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08462123945355415, "epoch": 2.424, "frac_reward_zero_std": 0.0, "grad_norm": 0.07095858454704285, "learning_rate": 1.0819740077695824e-06, "loss": -0.0026, "num_tokens": 10563632.0, "reward": 13.505558013916016, "reward_std": 1.648931860923767, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7744401693344116, "rewards/length2tails_reward/std": 0.3092653453350067, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.395167350769043, "rewards/thermo_reward/std": 1.4293804168701172, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.53125, "completions/mean_terminated_length": 272.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09231602028012276, "epoch": 2.426, "frac_reward_zero_std": 0.0, "grad_norm": 0.11986473947763443, "learning_rate": 1.0806959735752173e-06, "loss": 0.0009, "num_tokens": 10572385.0, "reward": 13.415735244750977, "reward_std": 1.188355565071106, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.746727705001831, "rewards/length2tails_reward/std": 0.320938378572464, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2807564735412598, "rewards/thermo_reward/std": 1.1604331731796265, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.15625, "completions/mean_terminated_length": 271.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0945893507450819, "epoch": 2.428, "frac_reward_zero_std": 0.0, "grad_norm": 0.12068387120962143, "learning_rate": 1.0794178066965993e-06, "loss": -0.0038, "num_tokens": 10581094.0, "reward": 13.645952224731445, "reward_std": 1.5051319599151611, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.531928062438965, "rewards/kidney_reward/std": 0.5348639488220215, "rewards/length2tails_reward/mean": 0.6977065801620483, "rewards/length2tails_reward/std": 0.31715697050094604, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.583068609237671, "rewards/thermo_reward/std": 0.9773446917533875, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.59375, "completions/mean_terminated_length": 273.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08710552845150232, "epoch": 2.43, "frac_reward_zero_std": 0.0, "grad_norm": 0.11045100539922714, "learning_rate": 1.0781395092353525e-06, "loss": -0.0033, "num_tokens": 10589881.0, "reward": 13.563175201416016, "reward_std": 1.3086121082305908, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8308136463165283, "rewards/length2tails_reward/std": 0.2591162621974945, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.477297306060791, "rewards/thermo_reward/std": 1.1414297819137573, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09739651903510094, "epoch": 2.432, "frac_reward_zero_std": 0.0, "grad_norm": 0.1206652969121933, "learning_rate": 1.0768610832933167e-06, "loss": 0.0002, "num_tokens": 10598639.0, "reward": 13.21466064453125, "reward_std": 2.480855941772461, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4809939861297607, "rewards/kidney_reward/std": 0.8229905962944031, "rewards/length2tails_reward/mean": 0.7944443225860596, "rewards/length2tails_reward/std": 0.25818219780921936, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.25054669380188, "rewards/thermo_reward/std": 1.5193787813186646, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.90625, "completions/mean_terminated_length": 273.90625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.08703937754034996, "epoch": 2.434, "frac_reward_zero_std": 0.0, "grad_norm": 0.1481795758008957, "learning_rate": 1.0755825309725415e-06, "loss": 0.0023, "num_tokens": 10607436.0, "reward": 13.805276870727539, "reward_std": 0.468355268239975, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8799004554748535, "rewards/length2tails_reward/std": 0.12640152871608734, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.40625, "completions/mean_terminated_length": 272.40625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09083101619035006, "epoch": 2.436, "frac_reward_zero_std": 0.0, "grad_norm": 0.1297113448381424, "learning_rate": 1.0743038543752852e-06, "loss": 0.001, "num_tokens": 10616185.0, "reward": 13.531122207641602, "reward_std": 1.9644542932510376, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5387279987335205, "rewards/kidney_reward/std": 0.49639827013015747, "rewards/length2tails_reward/mean": 0.7927061319351196, "rewards/length2tails_reward/std": 0.2188922017812729, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5094475746154785, "rewards/thermo_reward/std": 1.1716644763946533, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08744846004992723, "epoch": 2.438, "frac_reward_zero_std": 0.0, "grad_norm": 0.05081085115671158, "learning_rate": 1.07302505560401e-06, "loss": -0.0053, "num_tokens": 10624913.0, "reward": 13.099504470825195, "reward_std": 3.1085195541381836, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.5390896797180176, "rewards/kidney_reward/std": 0.49435171484947205, "rewards/length2tails_reward/mean": 0.757485568523407, "rewards/length2tails_reward/std": 0.2631629407405853, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3316128253936768, "rewards/thermo_reward/std": 1.2207468748092651, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09191823564469814, "epoch": 2.44, "frac_reward_zero_std": 0.0, "grad_norm": 0.13226445019245148, "learning_rate": 1.0717461367613792e-06, "loss": -0.0005, "num_tokens": 10633664.0, "reward": 13.734261512756348, "reward_std": 0.8752428889274597, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8042384386062622, "rewards/length2tails_reward/std": 0.23499715328216553, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5661725997924805, "rewards/thermo_reward/std": 0.8740096092224121, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.15625, "completions/mean_terminated_length": 273.15625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09383957739919424, "epoch": 2.442, "frac_reward_zero_std": 0.0, "grad_norm": 0.11894011497497559, "learning_rate": 1.0704670999502538e-06, "loss": 0.0035, "num_tokens": 10642437.0, "reward": 13.813368797302246, "reward_std": 0.5172167420387268, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8355466723442078, "rewards/length2tails_reward/std": 0.21538470685482025, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.65625, "completions/mean_terminated_length": 272.65625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09643411543220282, "epoch": 2.444, "frac_reward_zero_std": 0.0, "grad_norm": 0.1086791530251503, "learning_rate": 1.0691879472736883e-06, "loss": 0.0005, "num_tokens": 10651194.0, "reward": 13.303426742553711, "reward_std": 1.154916524887085, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7456244826316833, "rewards/length2tails_reward/std": 0.30064287781715393, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.195917844772339, "rewards/thermo_reward/std": 1.0026649236679077, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.28125, "completions/mean_terminated_length": 273.28125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.09604166820645332, "epoch": 2.446, "frac_reward_zero_std": 0.0, "grad_norm": 0.14771340787410736, "learning_rate": 1.0679086808349277e-06, "loss": -0.0001, "num_tokens": 10659971.0, "reward": 13.22848892211914, "reward_std": 2.40313458442688, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4679837226867676, "rewards/kidney_reward/std": 0.7528902888298035, "rewards/length2tails_reward/mean": 0.847961962223053, "rewards/length2tails_reward/std": 0.18185418844223022, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2720327377319336, "rewards/thermo_reward/std": 1.4439270496368408, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 268.25, "completions/mean_terminated_length": 268.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.09932470787316561, "epoch": 2.448, "frac_reward_zero_std": 0.0, "grad_norm": 0.3754037618637085, "learning_rate": 1.0666293027374043e-06, "loss": -0.0708, "num_tokens": 10668587.0, "reward": 12.5902099609375, "reward_std": 5.2144060134887695, "rewards/fitness_reward/mean": 6.969270706176758, "rewards/fitness_reward/std": 2.2170021533966064, "rewards/kidney_reward/mean": 2.2978575229644775, "rewards/kidney_reward/std": 1.3816429376602173, "rewards/length2tails_reward/mean": 0.7470918893814087, "rewards/length2tails_reward/std": 0.27570077776908875, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1483712196350098, "rewards/thermo_reward/std": 1.9086298942565918, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.46875, "completions/mean_terminated_length": 273.46875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0799891110509634, "epoch": 2.45, "frac_reward_zero_std": 0.0, "grad_norm": 0.13458140194416046, "learning_rate": 1.0653498150847342e-06, "loss": -0.0015, "num_tokens": 10677370.0, "reward": 13.183582305908203, "reward_std": 2.0887210369110107, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5188488960266113, "rewards/kidney_reward/std": 0.6088516116142273, "rewards/length2tails_reward/mean": 0.8200899958610535, "rewards/length2tails_reward/std": 0.260699599981308, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.12153959274292, "rewards/thermo_reward/std": 1.5782240629196167, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08691063802689314, "epoch": 2.452, "frac_reward_zero_std": 0.0, "grad_norm": 0.1246851459145546, "learning_rate": 1.064070219980713e-06, "loss": -0.0003, "num_tokens": 10686128.0, "reward": 13.246986389160156, "reward_std": 1.9061574935913086, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4933767318725586, "rewards/kidney_reward/std": 0.6130226850509644, "rewards/length2tails_reward/mean": 0.7921514511108398, "rewards/length2tails_reward/std": 0.25093090534210205, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.213209867477417, "rewards/thermo_reward/std": 1.4650269746780396, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08808344416320324, "epoch": 2.454, "frac_reward_zero_std": 0.0, "grad_norm": 0.0639815554022789, "learning_rate": 1.0627905195293135e-06, "loss": -0.0061, "num_tokens": 10694889.0, "reward": 12.932845115661621, "reward_std": 3.411823272705078, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.4229860305786133, "rewards/kidney_reward/std": 0.7031325697898865, "rewards/length2tails_reward/mean": 0.786348819732666, "rewards/length2tails_reward/std": 0.23730994760990143, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.278170585632324, "rewards/thermo_reward/std": 1.4518730640411377, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0898923072963953, "epoch": 2.456, "frac_reward_zero_std": 0.0, "grad_norm": 0.0836012214422226, "learning_rate": 1.0615107158346814e-06, "loss": -0.0034, "num_tokens": 10703654.0, "reward": 13.64627742767334, "reward_std": 0.9476872086524963, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7877658009529114, "rewards/length2tails_reward/std": 0.27643802762031555, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4798367023468018, "rewards/thermo_reward/std": 0.9370023608207703, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 271.59375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.09534856677055359, "epoch": 2.458, "frac_reward_zero_std": 0.0, "grad_norm": 0.14600451290607452, "learning_rate": 1.0602308110011326e-06, "loss": 0.0038, "num_tokens": 10712377.0, "reward": 13.80593490600586, "reward_std": 0.516539990901947, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7612037658691406, "rewards/length2tails_reward/std": 0.3153897821903229, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10472780279815197, "epoch": 2.46, "frac_reward_zero_std": 0.0, "grad_norm": 0.28879615664482117, "learning_rate": 1.0589508071331486e-06, "loss": 0.0049, "num_tokens": 10721113.0, "reward": 12.659233093261719, "reward_std": 4.623455047607422, "rewards/fitness_reward/mean": 6.967202663421631, "rewards/fitness_reward/std": 1.920608401298523, "rewards/kidney_reward/mean": 2.3063979148864746, "rewards/kidney_reward/std": 1.1783033609390259, "rewards/length2tails_reward/mean": 0.778078019618988, "rewards/length2tails_reward/std": 0.24348245561122894, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.20782470703125, "rewards/thermo_reward/std": 1.7583760023117065, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.96875, "completions/mean_terminated_length": 272.96875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09287342242896557, "epoch": 2.462, "frac_reward_zero_std": 0.0, "grad_norm": 0.17823663353919983, "learning_rate": 1.0576707063353745e-06, "loss": 0.0003, "num_tokens": 10729880.0, "reward": 12.828574180603027, "reward_std": 5.078460693359375, "rewards/fitness_reward/mean": 6.965703964233398, "rewards/fitness_reward/std": 2.23717999458313, "rewards/kidney_reward/mean": 2.368088722229004, "rewards/kidney_reward/std": 1.3110778331756592, "rewards/length2tails_reward/mean": 0.7966312170028687, "rewards/length2tails_reward/std": 0.2715126872062683, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3151190280914307, "rewards/thermo_reward/std": 1.5917843580245972, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08647519163787365, "epoch": 2.464, "frac_reward_zero_std": 0.0, "grad_norm": 0.06193329393863678, "learning_rate": 1.0563905107126144e-06, "loss": -0.0067, "num_tokens": 10738644.0, "reward": 13.224793434143066, "reward_std": 2.5808629989624023, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4222705364227295, "rewards/kidney_reward/std": 0.8070963025093079, "rewards/length2tails_reward/mean": 0.7824023962020874, "rewards/length2tails_reward/std": 0.2957296669483185, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2630972862243652, "rewards/thermo_reward/std": 1.7700457572937012, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.15625, "completions/mean_terminated_length": 272.15625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.10227520670741796, "epoch": 2.466, "frac_reward_zero_std": 0.0, "grad_norm": 0.17705994844436646, "learning_rate": 1.055110222369828e-06, "loss": 0.0074, "num_tokens": 10747385.0, "reward": 13.770784378051758, "reward_std": 0.546610414981842, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8085607290267944, "rewards/length2tails_reward/std": 0.23666897416114807, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.59375, "completions/mean_terminated_length": 273.59375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.10402582865208387, "epoch": 2.468, "frac_reward_zero_std": 0.0, "grad_norm": 0.22149960696697235, "learning_rate": 1.0538298434121282e-06, "loss": -0.0019, "num_tokens": 10756172.0, "reward": 13.709596633911133, "reward_std": 0.6485846042633057, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8691390752792358, "rewards/length2tails_reward/std": 0.17218910157680511, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.84375, "completions/mean_terminated_length": 272.84375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09120804257690907, "epoch": 2.4699999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.05785958468914032, "learning_rate": 1.0525493759447763e-06, "loss": -0.0047, "num_tokens": 10764935.0, "reward": 13.47886848449707, "reward_std": 1.4138140678405762, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5382261276245117, "rewards/kidney_reward/std": 0.49923640489578247, "rewards/length2tails_reward/mean": 0.794089674949646, "rewards/length2tails_reward/std": 0.2710154056549072, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.400047779083252, "rewards/thermo_reward/std": 0.9587206840515137, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.9375, "completions/mean_terminated_length": 272.9375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08946628589183092, "epoch": 2.472, "frac_reward_zero_std": 0.0, "grad_norm": 0.10416407883167267, "learning_rate": 1.0512688220731792e-06, "loss": 0.0017, "num_tokens": 10773701.0, "reward": 13.635832786560059, "reward_std": 1.0921016931533813, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8250085115432739, "rewards/length2tails_reward/std": 0.2135208547115326, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.520385980606079, "rewards/thermo_reward/std": 0.9200055599212646, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08608097489923239, "epoch": 2.474, "frac_reward_zero_std": 0.0, "grad_norm": 0.08293689042329788, "learning_rate": 1.0499881839028866e-06, "loss": -0.008, "num_tokens": 10782453.0, "reward": 13.533278465270996, "reward_std": 1.0921880006790161, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7436509132385254, "rewards/length2tails_reward/std": 0.3194071352481842, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4561173915863037, "rewards/thermo_reward/std": 0.8726389408111572, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.71875, "completions/mean_terminated_length": 271.71875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09384459909051657, "epoch": 2.476, "frac_reward_zero_std": 0.0, "grad_norm": 0.11530506610870361, "learning_rate": 1.0487074635395853e-06, "loss": 0.0013, "num_tokens": 10791180.0, "reward": 13.605308532714844, "reward_std": 0.6402997374534607, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7492390871047974, "rewards/length2tails_reward/std": 0.2651563882827759, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.470078706741333, "rewards/thermo_reward/std": 0.5830413699150085, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.09741722140461206, "epoch": 2.4779999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.1013156846165657, "learning_rate": 1.0474266630890985e-06, "loss": 0.0001, "num_tokens": 10799938.0, "reward": 13.702683448791504, "reward_std": 1.0358788967132568, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8046396970748901, "rewards/length2tails_reward/std": 0.2169589251279831, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5619144439697266, "rewards/thermo_reward/std": 0.8957966566085815, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 271.65625, "completions/mean_terminated_length": 271.65625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09494194388389587, "epoch": 2.48, "frac_reward_zero_std": 0.0, "grad_norm": 0.16340993344783783, "learning_rate": 1.0461457846573809e-06, "loss": -0.0004, "num_tokens": 10808663.0, "reward": 13.323081970214844, "reward_std": 1.4894757270812988, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7303534746170044, "rewards/length2tails_reward/std": 0.2566344141960144, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1897404193878174, "rewards/thermo_reward/std": 1.404547095298767, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08389176521450281, "epoch": 2.482, "frac_reward_zero_std": 0.0, "grad_norm": 0.12332534790039062, "learning_rate": 1.044864830350515e-06, "loss": 0.0017, "num_tokens": 10817431.0, "reward": 13.135833740234375, "reward_std": 2.7119650840759277, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.46026611328125, "rewards/kidney_reward/std": 0.7956667542457581, "rewards/length2tails_reward/mean": 0.8329859375953674, "rewards/length2tails_reward/std": 0.22733183205127716, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1885929107666016, "rewards/thermo_reward/std": 1.7483952045440674, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 274.28125, "completions/mean_terminated_length": 274.28125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.09511914569884539, "epoch": 2.484, "frac_reward_zero_std": 0.0, "grad_norm": 0.08770027756690979, "learning_rate": 1.0435838022747084e-06, "loss": -0.0017, "num_tokens": 10826240.0, "reward": 13.80496883392334, "reward_std": 0.4740360379219055, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.87681645154953, "rewards/length2tails_reward/std": 0.15799346566200256, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.08547612745314837, "epoch": 2.4859999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.1876983940601349, "learning_rate": 1.04230270253629e-06, "loss": 0.0002, "num_tokens": 10834936.0, "reward": 13.095527648925781, "reward_std": 2.6350486278533936, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.363231658935547, "rewards/kidney_reward/std": 0.9425392746925354, "rewards/length2tails_reward/mean": 0.7321509122848511, "rewards/length2tails_reward/std": 0.3210483491420746, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2554047107696533, "rewards/thermo_reward/std": 1.5079888105392456, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08697150368243456, "epoch": 2.488, "frac_reward_zero_std": 0.0, "grad_norm": 0.10575957596302032, "learning_rate": 1.0410215332417065e-06, "loss": -0.0019, "num_tokens": 10843632.0, "reward": 13.676462173461914, "reward_std": 0.9143254160881042, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.6497900485992432, "rewards/length2tails_reward/std": 0.34322085976600647, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5238184928894043, "rewards/thermo_reward/std": 0.9028612971305847, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09583816025406122, "epoch": 2.49, "frac_reward_zero_std": 0.0, "grad_norm": 0.08811348676681519, "learning_rate": 1.0397402964975186e-06, "loss": 0.0008, "num_tokens": 10852392.0, "reward": 13.367749214172363, "reward_std": 2.2184150218963623, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4928197860717773, "rewards/kidney_reward/std": 0.7560939788818359, "rewards/length2tails_reward/mean": 0.8037996292114258, "rewards/length2tails_reward/std": 0.2079600840806961, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.390873432159424, "rewards/thermo_reward/std": 1.1930720806121826, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0990471001714468, "epoch": 2.492, "frac_reward_zero_std": 0.0, "grad_norm": 0.06559862196445465, "learning_rate": 1.0384589944103983e-06, "loss": -0.0009, "num_tokens": 10861135.0, "reward": 13.631964683532715, "reward_std": 1.0133236646652222, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7975431680679321, "rewards/length2tails_reward/std": 0.2299695611000061, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4645447731018066, "rewards/thermo_reward/std": 1.0127943754196167, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09418590925633907, "epoch": 2.4939999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.1989491581916809, "learning_rate": 1.037177629087125e-06, "loss": -0.0025, "num_tokens": 10869907.0, "reward": 12.421655654907227, "reward_std": 5.143921852111816, "rewards/fitness_reward/mean": 6.928044319152832, "rewards/fitness_reward/std": 2.1392548084259033, "rewards/kidney_reward/mean": 2.222644567489624, "rewards/kidney_reward/std": 1.3976283073425293, "rewards/length2tails_reward/mean": 0.8509255647659302, "rewards/length2tails_reward/std": 0.16113992035388947, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.085874319076538, "rewards/thermo_reward/std": 1.8372138738632202, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08448371384292841, "epoch": 2.496, "frac_reward_zero_std": 0.0, "grad_norm": 0.12150036543607712, "learning_rate": 1.0358962026345824e-06, "loss": 0.0041, "num_tokens": 10878657.0, "reward": 13.807670593261719, "reward_std": 0.5155578255653381, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7785651683807373, "rewards/length2tails_reward/std": 0.22352470457553864, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.0876282462850213, "epoch": 2.498, "frac_reward_zero_std": 0.0, "grad_norm": 0.10053152590990067, "learning_rate": 1.0346147171597535e-06, "loss": 0.0021, "num_tokens": 10887395.0, "reward": 13.880105972290039, "reward_std": 0.3773041367530823, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8304663896560669, "rewards/length2tails_reward/std": 0.21791306138038635, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "entropy": 0.07947520073503256, "epoch": 2.5, "frac_reward_zero_std": 0.0, "grad_norm": 0.15748527646064758, "learning_rate": 1.0333331747697196e-06, "loss": 0.0053, "num_tokens": 10896131.0, "reward": 12.376877784729004, "reward_std": 5.173625469207764, "rewards/fitness_reward/mean": 6.897719383239746, "rewards/fitness_reward/std": 2.001847982406616, "rewards/kidney_reward/mean": 2.1834561824798584, "rewards/kidney_reward/std": 1.338996410369873, "rewards/length2tails_reward/mean": 0.779033362865448, "rewards/length2tails_reward/std": 0.30837929248809814, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1177992820739746, "rewards/thermo_reward/std": 2.124000310897827, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.09179925918579102, "epoch": 2.502, "frac_reward_zero_std": 0.0, "grad_norm": 0.1123429536819458, "learning_rate": 1.0320515775716554e-06, "loss": -0.0004, "num_tokens": 10904892.0, "reward": 13.492806434631348, "reward_std": 1.2470368146896362, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.855421245098114, "rewards/length2tails_reward/std": 0.122994564473629, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3743178844451904, "rewards/thermo_reward/std": 1.082673192024231, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0898106126114726, "epoch": 2.504, "frac_reward_zero_std": 0.0, "grad_norm": 0.130186066031456, "learning_rate": 1.0307699276728248e-06, "loss": -0.0052, "num_tokens": 10913662.0, "reward": 13.603492736816406, "reward_std": 1.795384407043457, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.512298583984375, "rewards/kidney_reward/std": 0.6459053158760071, "rewards/length2tails_reward/mean": 0.7948645353317261, "rewards/length2tails_reward/std": 0.2649412453174591, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.550522804260254, "rewards/thermo_reward/std": 1.1532329320907593, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.09375, "completions/mean_terminated_length": 272.09375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09738536924123764, "epoch": 2.5060000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.12294717133045197, "learning_rate": 1.0294882271805798e-06, "loss": 0.0041, "num_tokens": 10922401.0, "reward": 13.470455169677734, "reward_std": 1.1227214336395264, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7700117826461792, "rewards/length2tails_reward/std": 0.24589170515537262, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3605072498321533, "rewards/thermo_reward/std": 0.9652195572853088, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.53125, "completions/mean_terminated_length": 273.53125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09221231192350388, "epoch": 2.508, "frac_reward_zero_std": 0.0, "grad_norm": 0.10390887409448624, "learning_rate": 1.0282064782023544e-06, "loss": 0.0041, "num_tokens": 10931186.0, "reward": 13.456817626953125, "reward_std": 2.2660326957702637, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4679338932037354, "rewards/kidney_reward/std": 0.7531667947769165, "rewards/length2tails_reward/mean": 0.8385056257247925, "rewards/length2tails_reward/std": 0.21479752659797668, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.501357316970825, "rewards/thermo_reward/std": 1.2150810956954956, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 282.875, "completions/mean_terminated_length": 282.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0956075107678771, "epoch": 2.51, "frac_reward_zero_std": 0.0, "grad_norm": 0.19084641337394714, "learning_rate": 1.026924682845663e-06, "loss": -0.0193, "num_tokens": 10940270.0, "reward": 13.905851364135742, "reward_std": 0.32790374755859375, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.6890615224838257, "rewards/length2tails_reward/std": 0.3036077320575714, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.71875, "completions/mean_terminated_length": 272.71875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08849367219954729, "epoch": 2.512, "frac_reward_zero_std": 0.0, "grad_norm": 0.09959346055984497, "learning_rate": 1.0256428432180954e-06, "loss": 0.0044, "num_tokens": 10949029.0, "reward": 13.916118621826172, "reward_std": 0.3095528781414032, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7917271852493286, "rewards/length2tails_reward/std": 0.2443895936012268, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.8125, "completions/mean_terminated_length": 271.8125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09578220918774605, "epoch": 2.5140000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.06927625834941864, "learning_rate": 1.0243609614273155e-06, "loss": -0.0016, "num_tokens": 10957759.0, "reward": 13.563721656799316, "reward_std": 1.1785898208618164, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.755905032157898, "rewards/length2tails_reward/std": 0.2580097019672394, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.427825450897217, "rewards/thermo_reward/std": 1.1418836116790771, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.46875, "completions/mean_terminated_length": 271.46875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08337361365556717, "epoch": 2.516, "frac_reward_zero_std": 0.0, "grad_norm": 0.1192711740732193, "learning_rate": 1.0230790395810554e-06, "loss": -0.0049, "num_tokens": 10966478.0, "reward": 13.7611665725708, "reward_std": 0.8724231123924255, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.6928871273994446, "rewards/length2tails_reward/std": 0.27702251076698303, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.604213237762451, "rewards/thermo_reward/std": 0.8649076819419861, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0807227585464716, "epoch": 2.518, "frac_reward_zero_std": 0.0, "grad_norm": 0.0811200961470604, "learning_rate": 1.0217970797871138e-06, "loss": -0.0054, "num_tokens": 10975209.0, "reward": 12.884984970092773, "reward_std": 3.795445442199707, "rewards/fitness_reward/mean": 6.971607208251953, "rewards/fitness_reward/std": 1.8960589170455933, "rewards/kidney_reward/mean": 2.390123128890991, "rewards/kidney_reward/std": 0.8301148414611816, "rewards/length2tails_reward/mean": 0.7773324847221375, "rewards/length2tails_reward/std": 0.2632235586643219, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3455216884613037, "rewards/thermo_reward/std": 1.3048064708709717, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.1875, "completions/mean_terminated_length": 273.1875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0955256512388587, "epoch": 2.52, "frac_reward_zero_std": 0.0, "grad_norm": 0.10693230479955673, "learning_rate": 1.0205150841533512e-06, "loss": 0.0026, "num_tokens": 10983983.0, "reward": 13.840425491333008, "reward_std": 0.42499473690986633, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8325204849243164, "rewards/length2tails_reward/std": 0.2075044959783554, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08668213989585638, "epoch": 2.5220000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.06507860124111176, "learning_rate": 1.019233054787687e-06, "loss": -0.0003, "num_tokens": 10992687.0, "reward": 13.750646591186523, "reward_std": 0.9315276145935059, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7100703120231628, "rewards/length2tails_reward/std": 0.3212975859642029, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5919742584228516, "rewards/thermo_reward/std": 0.9297705888748169, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.3125, "completions/mean_terminated_length": 272.3125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09384338092058897, "epoch": 2.524, "frac_reward_zero_std": 0.0, "grad_norm": 0.07486650347709656, "learning_rate": 1.0179509937980971e-06, "loss": -0.0034, "num_tokens": 11001433.0, "reward": 13.481481552124023, "reward_std": 1.5638045072555542, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7633057832717896, "rewards/length2tails_reward/std": 0.27240240573883057, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3722052574157715, "rewards/thermo_reward/std": 1.3520067930221558, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.84375, "completions/mean_terminated_length": 272.84375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0895926021039486, "epoch": 2.526, "frac_reward_zero_std": 0.0, "grad_norm": 0.08937916904687881, "learning_rate": 1.0166689032926083e-06, "loss": -0.0036, "num_tokens": 11010196.0, "reward": 13.441423416137695, "reward_std": 1.288163661956787, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7962145805358887, "rewards/length2tails_reward/std": 0.2610096335411072, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.328855514526367, "rewards/thermo_reward/std": 1.1160696744918823, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.625, "completions/mean_terminated_length": 273.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08504704385995865, "epoch": 2.528, "frac_reward_zero_std": 0.0, "grad_norm": 0.1109505146741867, "learning_rate": 1.0153867853792966e-06, "loss": -0.0011, "num_tokens": 11018984.0, "reward": 13.530813217163086, "reward_std": 1.5682084560394287, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5330631732940674, "rewards/kidney_reward/std": 0.5284431576728821, "rewards/length2tails_reward/mean": 0.8431212306022644, "rewards/length2tails_reward/std": 0.22412942349910736, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4522523880004883, "rewards/thermo_reward/std": 1.0748932361602783, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.15625, "completions/mean_terminated_length": 272.15625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08961287420243025, "epoch": 2.5300000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.3546622693538666, "learning_rate": 1.014104642166282e-06, "loss": 0.0012, "num_tokens": 11027725.0, "reward": 12.969375610351562, "reward_std": 4.262889862060547, "rewards/fitness_reward/mean": 7.0060319900512695, "rewards/fitness_reward/std": 2.0090479850769043, "rewards/kidney_reward/mean": 2.4171500205993652, "rewards/kidney_reward/std": 1.0360020399093628, "rewards/length2tails_reward/mean": 0.7523579597473145, "rewards/length2tails_reward/std": 0.25718116760253906, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.370958089828491, "rewards/thermo_reward/std": 1.2960481643676758, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 274.59375, "completions/mean_terminated_length": 274.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09319291729480028, "epoch": 2.532, "frac_reward_zero_std": 0.0, "grad_norm": 0.06620564311742783, "learning_rate": 1.0128224757617272e-06, "loss": -0.0063, "num_tokens": 11036544.0, "reward": 12.781671524047852, "reward_std": 3.350013256072998, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.457012176513672, "rewards/kidney_reward/std": 0.5447914004325867, "rewards/length2tails_reward/mean": 0.8345171213150024, "rewards/length2tails_reward/std": 0.26822420954704285, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0881545543670654, "rewards/thermo_reward/std": 1.6181961297988892, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.625, "completions/mean_terminated_length": 273.625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08962809201329947, "epoch": 2.534, "frac_reward_zero_std": 0.0, "grad_norm": 0.11474078893661499, "learning_rate": 1.0115402882738333e-06, "loss": -0.0023, "num_tokens": 11045332.0, "reward": 13.674827575683594, "reward_std": 0.8290287256240845, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8229432106018066, "rewards/length2tails_reward/std": 0.2556893229484558, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.1875, "completions/mean_terminated_length": 270.1875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.09624328929930925, "epoch": 2.536, "frac_reward_zero_std": 0.0, "grad_norm": 0.09320349991321564, "learning_rate": 1.0102580818108345e-06, "loss": 0.0033, "num_tokens": 11054010.0, "reward": 13.636301040649414, "reward_std": 1.0296601057052612, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7002662420272827, "rewards/length2tails_reward/std": 0.33832138776779175, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5059685707092285, "rewards/thermo_reward/std": 0.9929302930831909, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09368395432829857, "epoch": 2.5380000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.1153569370508194, "learning_rate": 1.0089758584809977e-06, "loss": -0.0002, "num_tokens": 11062790.0, "reward": 13.67737102508545, "reward_std": 0.5592531561851501, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7974194288253784, "rewards/length2tails_reward/std": 0.2665257453918457, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5099644660949707, "rewards/thermo_reward/std": 0.5615194439888, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09078614320605993, "epoch": 2.54, "frac_reward_zero_std": 0.0, "grad_norm": 0.10006042569875717, "learning_rate": 1.0076936203926172e-06, "loss": 0.0046, "num_tokens": 11071560.0, "reward": 13.719049453735352, "reward_std": 0.5301154255867004, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8153437972068787, "rewards/length2tails_reward/std": 0.2255038619041443, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498504638671875, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08808152377605438, "epoch": 2.542, "frac_reward_zero_std": 0.0, "grad_norm": 0.09218467026948929, "learning_rate": 1.0064113696540111e-06, "loss": -0.0015, "num_tokens": 11080297.0, "reward": 13.561868667602539, "reward_std": 1.4590805768966675, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7816716432571411, "rewards/length2tails_reward/std": 0.22780533134937286, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.450754165649414, "rewards/thermo_reward/std": 1.281867265701294, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.6875, "completions/mean_terminated_length": 273.6875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09965451620519161, "epoch": 2.544, "frac_reward_zero_std": 0.0, "grad_norm": 0.08999935537576675, "learning_rate": 1.0051291083735183e-06, "loss": -0.0012, "num_tokens": 11089087.0, "reward": 13.493375778198242, "reward_std": 0.9772318601608276, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.842260479927063, "rewards/length2tails_reward/std": 0.22625687718391418, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3762035369873047, "rewards/thermo_reward/std": 0.8942775130271912, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.1875, "completions/mean_terminated_length": 272.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09344311617314816, "epoch": 2.5460000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.14794650673866272, "learning_rate": 1.0038468386594957e-06, "loss": 0.0025, "num_tokens": 11097829.0, "reward": 13.48156452178955, "reward_std": 1.4521809816360474, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7624658346176147, "rewards/length2tails_reward/std": 0.25733861327171326, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.345012664794922, "rewards/thermo_reward/std": 1.326688289642334, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "entropy": 0.09371494967490435, "epoch": 2.548, "frac_reward_zero_std": 0.0, "grad_norm": 0.0943506732583046, "learning_rate": 1.0025645626203135e-06, "loss": 0.0035, "num_tokens": 11106593.0, "reward": 12.683286666870117, "reward_std": 4.46260404586792, "rewards/fitness_reward/mean": 6.967517852783203, "rewards/fitness_reward/std": 1.9188534021377563, "rewards/kidney_reward/mean": 2.366368055343628, "rewards/kidney_reward/std": 1.0303393602371216, "rewards/length2tails_reward/mean": 0.8446621894836426, "rewards/length2tails_reward/std": 0.20369280874729156, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1649341583251953, "rewards/thermo_reward/std": 1.7017467021942139, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09274380747228861, "epoch": 2.55, "frac_reward_zero_std": 0.0, "grad_norm": 0.0775628462433815, "learning_rate": 1.0012822823643522e-06, "loss": -0.0063, "num_tokens": 11115349.0, "reward": 13.795833587646484, "reward_std": 0.4870997369289398, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7854651212692261, "rewards/length2tails_reward/std": 0.28153514862060547, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 274.0625, "completions/mean_terminated_length": 274.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09395697340369225, "epoch": 2.552, "frac_reward_zero_std": 0.0, "grad_norm": 0.5975285768508911, "learning_rate": 1e-06, "loss": -0.0071, "num_tokens": 11124151.0, "reward": 13.259191513061523, "reward_std": 2.373781442642212, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4164295196533203, "rewards/kidney_reward/std": 0.8949340581893921, "rewards/length2tails_reward/mean": 0.877596378326416, "rewards/length2tails_reward/std": 0.1836080551147461, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3513259887695312, "rewards/thermo_reward/std": 1.1975548267364502, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.28125, "completions/mean_terminated_length": 273.28125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.09055192023515701, "epoch": 2.5540000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.09261415898799896, "learning_rate": 9.987177176356477e-07, "loss": 0.0016, "num_tokens": 11132928.0, "reward": 13.560474395751953, "reward_std": 1.0087324380874634, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8390970826148987, "rewards/length2tails_reward/std": 0.1651468575000763, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4162583351135254, "rewards/thermo_reward/std": 0.8840147256851196, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.15625, "completions/mean_terminated_length": 273.15625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09465023316442966, "epoch": 2.556, "frac_reward_zero_std": 0.0, "grad_norm": 0.1217871680855751, "learning_rate": 9.974354373796866e-07, "loss": 0.0025, "num_tokens": 11141701.0, "reward": 13.690476417541504, "reward_std": 0.6046956181526184, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8032035827636719, "rewards/length2tails_reward/std": 0.23792394995689392, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498507022857666, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0930365202948451, "epoch": 2.558, "frac_reward_zero_std": 0.0, "grad_norm": 0.10494538396596909, "learning_rate": 9.961531613405042e-07, "loss": -0.0046, "num_tokens": 11150453.0, "reward": 13.534183502197266, "reward_std": 1.6083168983459473, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7721899151802063, "rewards/length2tails_reward/std": 0.2945183515548706, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4240174293518066, "rewards/thermo_reward/std": 1.4253337383270264, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09113780409097672, "epoch": 2.56, "frac_reward_zero_std": 0.0, "grad_norm": 0.10989918559789658, "learning_rate": 9.948708916264816e-07, "loss": -0.0034, "num_tokens": 11159181.0, "reward": 13.42755126953125, "reward_std": 1.8706536293029785, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5374107360839844, "rewards/kidney_reward/std": 0.503849983215332, "rewards/length2tails_reward/mean": 0.7447144389152527, "rewards/length2tails_reward/std": 0.2845161557197571, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3544845581054688, "rewards/thermo_reward/std": 1.3823689222335815, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09468003083020449, "epoch": 2.5620000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.09075962752103806, "learning_rate": 9.935886303459888e-07, "loss": 0.0029, "num_tokens": 11167951.0, "reward": 13.570971488952637, "reward_std": 1.215386986732483, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8060849905014038, "rewards/length2tails_reward/std": 0.2591499388217926, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4574167728424072, "rewards/thermo_reward/std": 1.0486905574798584, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08730126172304153, "epoch": 2.564, "frac_reward_zero_std": 0.0, "grad_norm": 0.10445868223905563, "learning_rate": 9.923063796073825e-07, "loss": -0.0011, "num_tokens": 11176699.0, "reward": 12.568877220153809, "reward_std": 5.231773376464844, "rewards/fitness_reward/mean": 6.651214122772217, "rewards/fitness_reward/std": 2.7942206859588623, "rewards/kidney_reward/mean": 2.4094700813293457, "rewards/kidney_reward/std": 1.0789785385131836, "rewards/length2tails_reward/mean": 0.7707823514938354, "rewards/length2tails_reward/std": 0.26711055636405945, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3311150074005127, "rewards/thermo_reward/std": 1.571403980255127, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0821879762224853, "epoch": 2.566, "frac_reward_zero_std": 0.0, "grad_norm": 0.14877688884735107, "learning_rate": 9.91024141519002e-07, "loss": 0.0087, "num_tokens": 11185455.0, "reward": 13.497551918029785, "reward_std": 1.3498157262802124, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7818381190299988, "rewards/length2tails_reward/std": 0.25987908244132996, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3864216804504395, "rewards/thermo_reward/std": 1.1549537181854248, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.9375, "completions/mean_terminated_length": 271.9375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09241542033851147, "epoch": 2.568, "frac_reward_zero_std": 0.0, "grad_norm": 0.22466203570365906, "learning_rate": 9.897419181891654e-07, "loss": -0.0002, "num_tokens": 11194189.0, "reward": 13.665223121643066, "reward_std": 1.5177754163742065, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.495814561843872, "rewards/kidney_reward/std": 0.5996876955032349, "rewards/length2tails_reward/mean": 0.7762358784675598, "rewards/length2tails_reward/std": 0.24603904783725739, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6305999755859375, "rewards/thermo_reward/std": 0.917843759059906, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08896076306700706, "epoch": 2.57, "frac_reward_zero_std": 0.0, "grad_norm": 0.08401848375797272, "learning_rate": 9.884597117261666e-07, "loss": -0.0019, "num_tokens": 11202921.0, "reward": 13.83401870727539, "reward_std": 0.4327254891395569, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7684556245803833, "rewards/length2tails_reward/std": 0.253930002450943, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 271.3125, "completions/mean_terminated_length": 271.3125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09406418073922396, "epoch": 2.572, "frac_reward_zero_std": 0.0, "grad_norm": 0.10037721693515778, "learning_rate": 9.871775242382725e-07, "loss": 0.0042, "num_tokens": 11211635.0, "reward": 13.790679931640625, "reward_std": 0.4637486934661865, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7339187860488892, "rewards/length2tails_reward/std": 0.26712721586227417, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08917554654181004, "epoch": 2.574, "frac_reward_zero_std": 0.0, "grad_norm": 0.1197580024600029, "learning_rate": 9.85895357833718e-07, "loss": 0.0004, "num_tokens": 11220366.0, "reward": 12.90008544921875, "reward_std": 4.444961071014404, "rewards/fitness_reward/mean": 6.947967052459717, "rewards/fitness_reward/std": 2.0279359817504883, "rewards/kidney_reward/mean": 2.416769504547119, "rewards/kidney_reward/std": 1.0381296873092651, "rewards/length2tails_reward/mean": 0.7331525087356567, "rewards/length2tails_reward/std": 0.3075205981731415, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.362034320831299, "rewards/thermo_reward/std": 1.5685944557189941, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.6875, "completions/mean_terminated_length": 271.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07669658865779638, "epoch": 2.576, "frac_reward_zero_std": 0.0, "grad_norm": 0.2233160138130188, "learning_rate": 9.846132146207038e-07, "loss": -0.008, "num_tokens": 11229092.0, "reward": 12.308110237121582, "reward_std": 4.719709396362305, "rewards/fitness_reward/mean": 6.687413215637207, "rewards/fitness_reward/std": 2.4316930770874023, "rewards/kidney_reward/mean": 2.266258478164673, "rewards/kidney_reward/std": 1.1048941612243652, "rewards/length2tails_reward/mean": 0.6728297472000122, "rewards/length2tails_reward/std": 0.375557005405426, "rewards/repeated_in_batch_reward/mean": 0.9375, "rewards/repeated_in_batch_reward/std": 0.24593468010425568, "rewards/thermo_reward/mean": 3.193406105041504, "rewards/thermo_reward/std": 1.554321527481079, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.0911758104339242, "epoch": 2.578, "frac_reward_zero_std": 0.0, "grad_norm": 0.07548670470714569, "learning_rate": 9.833310967073918e-07, "loss": -0.0013, "num_tokens": 11237860.0, "reward": 13.564449310302734, "reward_std": 1.4019521474838257, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8588228225708008, "rewards/length2tails_reward/std": 0.21809880435466766, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.418262481689453, "rewards/thermo_reward/std": 1.2506855726242065, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.1875, "completions/mean_terminated_length": 269.1875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.09383352566510439, "epoch": 2.58, "frac_reward_zero_std": 0.0, "grad_norm": 0.6485643982887268, "learning_rate": 9.82049006201903e-07, "loss": -0.0941, "num_tokens": 11246506.0, "reward": 12.823418617248535, "reward_std": 4.95705509185791, "rewards/fitness_reward/mean": 6.976078033447266, "rewards/fitness_reward/std": 2.1784961223602295, "rewards/kidney_reward/mean": 2.378368854522705, "rewards/kidney_reward/std": 1.2533495426177979, "rewards/length2tails_reward/mean": 0.8960451483726501, "rewards/length2tails_reward/std": 0.13475503027439117, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2793679237365723, "rewards/thermo_reward/std": 1.744246244430542, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.71875, "completions/mean_terminated_length": 273.71875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08943956810981035, "epoch": 2.582, "frac_reward_zero_std": 0.0, "grad_norm": 0.11884415149688721, "learning_rate": 9.807669452123129e-07, "loss": 0.0007, "num_tokens": 11255297.0, "reward": 13.767016410827637, "reward_std": 1.1081784963607788, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8364322781562805, "rewards/length2tails_reward/std": 0.21595613658428192, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6230673789978027, "rewards/thermo_reward/std": 0.9592058062553406, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.21875, "completions/mean_terminated_length": 273.21875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.09474212862551212, "epoch": 2.584, "frac_reward_zero_std": 0.0, "grad_norm": 0.08420742303133011, "learning_rate": 9.794849158466492e-07, "loss": -0.0018, "num_tokens": 11264072.0, "reward": 13.596220970153809, "reward_std": 1.322066307067871, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8675425052642822, "rewards/length2tails_reward/std": 0.1385192573070526, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.476520538330078, "rewards/thermo_reward/std": 1.1455049514770508, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08064570464193821, "epoch": 2.586, "frac_reward_zero_std": 0.0, "grad_norm": 0.08840952068567276, "learning_rate": 9.782029202128863e-07, "loss": 0.0042, "num_tokens": 11272827.0, "reward": 13.258116722106934, "reward_std": 2.666351079940796, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.459886074066162, "rewards/kidney_reward/std": 0.7977776527404785, "rewards/length2tails_reward/mean": 0.7825390100479126, "rewards/length2tails_reward/std": 0.2645307183265686, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.373810291290283, "rewards/thermo_reward/std": 1.3559972047805786, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.4375, "completions/mean_terminated_length": 271.4375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08870595414191484, "epoch": 2.588, "frac_reward_zero_std": 0.0, "grad_norm": 0.10983943194150925, "learning_rate": 9.769209604189447e-07, "loss": -0.006, "num_tokens": 11281545.0, "reward": 13.830974578857422, "reward_std": 0.43895232677459717, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7380162477493286, "rewards/length2tails_reward/std": 0.2492593377828598, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 282.59375, "completions/mean_terminated_length": 282.59375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09890826977789402, "epoch": 2.59, "frac_reward_zero_std": 0.0, "grad_norm": 1.069254994392395, "learning_rate": 9.756390385726847e-07, "loss": -0.0119, "num_tokens": 11290620.0, "reward": 13.273027420043945, "reward_std": 2.208266496658325, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.453822135925293, "rewards/kidney_reward/std": 0.6914807558059692, "rewards/length2tails_reward/mean": 0.7368823885917664, "rewards/length2tails_reward/std": 0.28326600790023804, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.341841220855713, "rewards/thermo_reward/std": 1.2458171844482422, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08741918485611677, "epoch": 2.592, "frac_reward_zero_std": 0.0, "grad_norm": 0.08522574603557587, "learning_rate": 9.743571567819045e-07, "loss": -0.0004, "num_tokens": 11299349.0, "reward": 13.694768905639648, "reward_std": 0.9248803853988647, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.6994171142578125, "rewards/length2tails_reward/std": 0.281656950712204, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5645222663879395, "rewards/thermo_reward/std": 0.8824394345283508, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 274.625, "completions/mean_terminated_length": 274.625, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.09598539769649506, "epoch": 2.594, "frac_reward_zero_std": 0.0, "grad_norm": 0.09700886160135269, "learning_rate": 9.730753171543374e-07, "loss": 0.0012, "num_tokens": 11308169.0, "reward": 13.684208869934082, "reward_std": 0.8987782001495361, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.9033083319664001, "rewards/length2tails_reward/std": 0.13247719407081604, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5335729122161865, "rewards/thermo_reward/std": 0.8546720743179321, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.3125, "completions/mean_terminated_length": 272.3125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09415233507752419, "epoch": 2.596, "frac_reward_zero_std": 0.0, "grad_norm": 0.19626864790916443, "learning_rate": 9.717935217976457e-07, "loss": -0.0005, "num_tokens": 11316915.0, "reward": 13.699674606323242, "reward_std": 0.6537702679634094, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.769914984703064, "rewards/length2tails_reward/std": 0.2710944712162018, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "entropy": 0.09926941432058811, "epoch": 2.598, "frac_reward_zero_std": 0.0, "grad_norm": 0.1721256971359253, "learning_rate": 9.7051177281942e-07, "loss": -0.0007, "num_tokens": 11325669.0, "reward": 13.715205192565918, "reward_std": 0.5400824546813965, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7769005298614502, "rewards/length2tails_reward/std": 0.29632213711738586, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498504638671875, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09105388261377811, "epoch": 2.6, "frac_reward_zero_std": 0.0, "grad_norm": 0.09366677701473236, "learning_rate": 9.692300723271752e-07, "loss": -0.0025, "num_tokens": 11334407.0, "reward": 12.53223705291748, "reward_std": 4.189420223236084, "rewards/fitness_reward/mean": 6.938035011291504, "rewards/fitness_reward/std": 1.840762972831726, "rewards/kidney_reward/mean": 2.2998170852661133, "rewards/kidney_reward/std": 1.048743724822998, "rewards/length2tails_reward/mean": 0.7610618472099304, "rewards/length2tails_reward/std": 0.30591100454330444, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1182785034179688, "rewards/thermo_reward/std": 1.7776644229888916, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.1875, "completions/mean_terminated_length": 273.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09142200648784637, "epoch": 2.602, "frac_reward_zero_std": 0.0, "grad_norm": 0.10108436644077301, "learning_rate": 9.679484224283447e-07, "loss": -0.0024, "num_tokens": 11343181.0, "reward": 13.734735488891602, "reward_std": 0.5882219076156616, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8469273447990417, "rewards/length2tails_reward/std": 0.2233157604932785, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897367000579834, "rewards/thermo_reward/std": 0.5061467885971069, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.375, "completions/mean_terminated_length": 270.375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.09068982116878033, "epoch": 2.604, "frac_reward_zero_std": 0.0, "grad_norm": 0.15013930201530457, "learning_rate": 9.666668252302806e-07, "loss": 0.0007, "num_tokens": 11351865.0, "reward": 12.603252410888672, "reward_std": 5.028195381164551, "rewards/fitness_reward/mean": 6.663951873779297, "rewards/fitness_reward/std": 2.751307964324951, "rewards/kidney_reward/mean": 2.4275808334350586, "rewards/kidney_reward/std": 0.9776963591575623, "rewards/length2tails_reward/mean": 0.7178422808647156, "rewards/length2tails_reward/std": 0.3161407709121704, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.339935779571533, "rewards/thermo_reward/std": 1.525055170059204, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.8125, "completions/mean_terminated_length": 272.8125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.08908124640583992, "epoch": 2.606, "frac_reward_zero_std": 0.0, "grad_norm": 0.09386913478374481, "learning_rate": 9.653852828402466e-07, "loss": -0.0004, "num_tokens": 11360627.0, "reward": 13.692395210266113, "reward_std": 0.9287048578262329, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8492799401283264, "rewards/length2tails_reward/std": 0.2031726986169815, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5198025703430176, "rewards/thermo_reward/std": 0.9229288101196289, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08594439877197146, "epoch": 2.608, "frac_reward_zero_std": 0.0, "grad_norm": 0.10545476526021957, "learning_rate": 9.641037973654178e-07, "loss": -0.003, "num_tokens": 11369375.0, "reward": 13.526748657226562, "reward_std": 1.576993703842163, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.535891532897949, "rewards/kidney_reward/std": 0.5124428272247314, "rewards/length2tails_reward/mean": 0.7894778251647949, "rewards/length2tails_reward/std": 0.26516595482826233, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4507243633270264, "rewards/thermo_reward/std": 1.1982841491699219, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09429272636771202, "epoch": 2.61, "frac_reward_zero_std": 0.0, "grad_norm": 0.1060292050242424, "learning_rate": 9.628223709128749e-07, "loss": 0.0014, "num_tokens": 11378123.0, "reward": 13.520185470581055, "reward_std": 1.6307603120803833, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.78755122423172, "rewards/length2tails_reward/std": 0.24123510718345642, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.381124258041382, "rewards/thermo_reward/std": 1.503252625465393, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 274.59375, "completions/mean_terminated_length": 274.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08381335902959108, "epoch": 2.612, "frac_reward_zero_std": 0.0, "grad_norm": 0.1200776994228363, "learning_rate": 9.615410055896014e-07, "loss": 0.001, "num_tokens": 11386942.0, "reward": 12.965974807739258, "reward_std": 3.969346761703491, "rewards/fitness_reward/mean": 7.0461039543151855, "rewards/fitness_reward/std": 1.7823677062988281, "rewards/kidney_reward/mean": 2.481417179107666, "rewards/kidney_reward/std": 0.8205979466438293, "rewards/length2tails_reward/mean": 0.8177987337112427, "rewards/length2tails_reward/std": 0.2320345640182495, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2566728591918945, "rewards/thermo_reward/std": 1.5435835123062134, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.08833951037377119, "epoch": 2.614, "frac_reward_zero_std": 0.0, "grad_norm": 0.08464069664478302, "learning_rate": 9.602597035024815e-07, "loss": -0.004, "num_tokens": 11395670.0, "reward": 13.229515075683594, "reward_std": 1.6974692344665527, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8088982105255127, "rewards/length2tails_reward/std": 0.26726973056793213, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0883195400238037, "rewards/thermo_reward/std": 1.6101175546646118, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.09514253214001656, "epoch": 2.616, "frac_reward_zero_std": 0.0, "grad_norm": 0.0928158089518547, "learning_rate": 9.589784667582934e-07, "loss": -0.0015, "num_tokens": 11404390.0, "reward": 13.878170013427734, "reward_std": 0.3858002722263336, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8111158013343811, "rewards/length2tails_reward/std": 0.23564399778842926, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.08827656414359808, "epoch": 2.618, "frac_reward_zero_std": 0.0, "grad_norm": 0.09488391876220703, "learning_rate": 9.576972974637097e-07, "loss": 0.0011, "num_tokens": 11413122.0, "reward": 13.837629318237305, "reward_std": 0.42670488357543945, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8045660853385925, "rewards/length2tails_reward/std": 0.18262574076652527, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.0957744549959898, "epoch": 2.62, "frac_reward_zero_std": 0.0, "grad_norm": 0.283891499042511, "learning_rate": 9.564161977252915e-07, "loss": 0.0002, "num_tokens": 11421877.0, "reward": 13.834540367126465, "reward_std": 0.43187758326530457, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.773668646812439, "rewards/length2tails_reward/std": 0.24948285520076752, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08740524481981993, "epoch": 2.622, "frac_reward_zero_std": 0.0, "grad_norm": 0.08296260982751846, "learning_rate": 9.551351696494853e-07, "loss": -0.0021, "num_tokens": 11430620.0, "reward": 13.915056228637695, "reward_std": 0.31537458300590515, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7811083793640137, "rewards/length2tails_reward/std": 0.24637141823768616, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09013056010007858, "epoch": 2.624, "frac_reward_zero_std": 0.0, "grad_norm": 0.1346326768398285, "learning_rate": 9.538542153426195e-07, "loss": -0.006, "num_tokens": 11439367.0, "reward": 13.187000274658203, "reward_std": 2.221527099609375, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3923935890197754, "rewards/kidney_reward/std": 0.8301119208335876, "rewards/length2tails_reward/mean": 0.7555396556854248, "rewards/length2tails_reward/std": 0.29163092374801636, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3153765201568604, "rewards/thermo_reward/std": 1.1827013492584229, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08751111663877964, "epoch": 2.626, "frac_reward_zero_std": 0.0, "grad_norm": 0.08335331082344055, "learning_rate": 9.525733369109017e-07, "loss": -0.0019, "num_tokens": 11448102.0, "reward": 13.484519004821777, "reward_std": 1.0472941398620605, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7206494808197021, "rewards/length2tails_reward/std": 0.29937759041786194, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3521485328674316, "rewards/thermo_reward/std": 1.0041559934616089, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0922544403001666, "epoch": 2.628, "frac_reward_zero_std": 0.0, "grad_norm": 0.058741435408592224, "learning_rate": 9.512925364604151e-07, "loss": -0.0056, "num_tokens": 11456850.0, "reward": 13.344647407531738, "reward_std": 3.008349657058716, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.5390896797180176, "rewards/kidney_reward/std": 0.49435171484947205, "rewards/length2tails_reward/mean": 0.7650218605995178, "rewards/length2tails_reward/std": 0.2936790883541107, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5760021209716797, "rewards/thermo_reward/std": 0.8242037296295166, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.21875, "completions/mean_terminated_length": 273.21875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0923885004594922, "epoch": 2.63, "frac_reward_zero_std": 0.0, "grad_norm": 0.08085627108812332, "learning_rate": 9.500118160971138e-07, "loss": -0.0054, "num_tokens": 11465625.0, "reward": 13.880472183227539, "reward_std": 0.38467854261398315, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8341273069381714, "rewards/length2tails_reward/std": 0.19471846520900726, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 271.59375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "entropy": 0.10310139693319798, "epoch": 2.632, "frac_reward_zero_std": 0.0, "grad_norm": 0.07032930105924606, "learning_rate": 9.487311779268209e-07, "loss": -0.0031, "num_tokens": 11474348.0, "reward": 13.608085632324219, "reward_std": 1.227725625038147, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7867770791053772, "rewards/length2tails_reward/std": 0.22749637067317963, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4964609146118164, "rewards/thermo_reward/std": 1.0417221784591675, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0867899302393198, "epoch": 2.634, "frac_reward_zero_std": 0.0, "grad_norm": 0.09172943234443665, "learning_rate": 9.474506240552238e-07, "loss": -0.0067, "num_tokens": 11483095.0, "reward": 13.61172103881836, "reward_std": 1.7407996654510498, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.537740707397461, "rewards/kidney_reward/std": 0.5019829869270325, "rewards/length2tails_reward/mean": 0.7726287841796875, "rewards/length2tails_reward/std": 0.2952404320240021, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.535531759262085, "rewards/thermo_reward/std": 1.235055685043335, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0941666942089796, "epoch": 2.636, "frac_reward_zero_std": 0.0, "grad_norm": 0.09857843071222305, "learning_rate": 9.461701565878718e-07, "loss": -0.003, "num_tokens": 11491842.0, "reward": 13.678802490234375, "reward_std": 0.564246416091919, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8117349147796631, "rewards/length2tails_reward/std": 0.22255975008010864, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.50996470451355, "rewards/thermo_reward/std": 0.5615194439888, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.40625, "completions/mean_terminated_length": 272.40625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09366055484861135, "epoch": 2.638, "frac_reward_zero_std": 0.0, "grad_norm": 0.11917433142662048, "learning_rate": 9.448897776301721e-07, "loss": -0.0, "num_tokens": 11500591.0, "reward": 13.591472625732422, "reward_std": 1.2057130336761475, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7693405747413635, "rewards/length2tails_reward/std": 0.2404543161392212, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4542322158813477, "rewards/thermo_reward/std": 1.0648303031921387, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.08513898495584726, "epoch": 2.64, "frac_reward_zero_std": 0.0, "grad_norm": 0.1232292652130127, "learning_rate": 9.436094892873857e-07, "loss": 0.001, "num_tokens": 11509359.0, "reward": 13.616416931152344, "reward_std": 1.497071385383606, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5254647731781006, "rewards/kidney_reward/std": 0.5714265704154968, "rewards/length2tails_reward/mean": 0.8009185791015625, "rewards/length2tails_reward/std": 0.28200775384902954, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.549675941467285, "rewards/thermo_reward/std": 0.9590282440185547, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 270.53125, "completions/mean_terminated_length": 270.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08928291499614716, "epoch": 2.642, "frac_reward_zero_std": 0.0, "grad_norm": 0.0923282653093338, "learning_rate": 9.423292936646257e-07, "loss": 0.002, "num_tokens": 11518048.0, "reward": 13.902721405029297, "reward_std": 0.30983614921569824, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.6577587127685547, "rewards/length2tails_reward/std": 0.30942097306251526, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 270.9375, "completions/mean_terminated_length": 270.9375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.08833121135830879, "epoch": 2.644, "frac_reward_zero_std": 0.0, "grad_norm": 0.06364872306585312, "learning_rate": 9.410491928668515e-07, "loss": -0.0039, "num_tokens": 11526750.0, "reward": 13.665266036987305, "reward_std": 1.437191367149353, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5205302238464355, "rewards/kidney_reward/std": 0.5993408560752869, "rewards/length2tails_reward/mean": 0.765282154083252, "rewards/length2tails_reward/std": 0.2492385059595108, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.607022523880005, "rewards/thermo_reward/std": 0.8501169085502625, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09671840630471706, "epoch": 2.646, "frac_reward_zero_std": 0.0, "grad_norm": 0.06736798584461212, "learning_rate": 9.397691889988674e-07, "loss": -0.003, "num_tokens": 11535497.0, "reward": 13.687711715698242, "reward_std": 1.0258924961090088, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.812004804611206, "rewards/length2tails_reward/std": 0.22589294612407684, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5735652446746826, "rewards/thermo_reward/std": 0.8364829421043396, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 271.46875, "completions/mean_terminated_length": 271.46875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0973320035263896, "epoch": 2.648, "frac_reward_zero_std": 0.0, "grad_norm": 0.10905929654836655, "learning_rate": 9.384892841653187e-07, "loss": -0.0027, "num_tokens": 11544216.0, "reward": 13.089970588684082, "reward_std": 2.683310031890869, "rewards/fitness_reward/mean": 7.052721977233887, "rewards/fitness_reward/std": 1.7449299097061157, "rewards/kidney_reward/mean": 2.4832568168640137, "rewards/kidney_reward/std": 0.5357418060302734, "rewards/length2tails_reward/mean": 0.7538886070251465, "rewards/length2tails_reward/std": 0.2668653428554535, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3786025047302246, "rewards/thermo_reward/std": 1.2563246488571167, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 274.03125, "completions/mean_terminated_length": 274.03125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09613270545378327, "epoch": 2.65, "frac_reward_zero_std": 0.0, "grad_norm": 0.0994558036327362, "learning_rate": 9.372094804706866e-07, "loss": -0.0021, "num_tokens": 11553017.0, "reward": 13.69377326965332, "reward_std": 0.6110822558403015, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8361679315567017, "rewards/length2tails_reward/std": 0.231988325715065, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498504638671875, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08980119600892067, "epoch": 2.652, "frac_reward_zero_std": 0.0, "grad_norm": 0.09267576783895493, "learning_rate": 9.359297800192871e-07, "loss": -0.0017, "num_tokens": 11561768.0, "reward": 13.330853462219238, "reward_std": 1.7552409172058105, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5369973182678223, "rewards/kidney_reward/std": 0.5061891078948975, "rewards/length2tails_reward/mean": 0.7579162120819092, "rewards/length2tails_reward/std": 0.28038784861564636, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2568795680999756, "rewards/thermo_reward/std": 1.2792414426803589, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.46875, "completions/mean_terminated_length": 271.46875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08606134541332722, "epoch": 2.654, "frac_reward_zero_std": 0.0, "grad_norm": 0.08788134902715683, "learning_rate": 9.346501849152658e-07, "loss": -0.0094, "num_tokens": 11570487.0, "reward": 12.317151069641113, "reward_std": 4.191816329956055, "rewards/fitness_reward/mean": 6.938035011291504, "rewards/fitness_reward/std": 1.840762972831726, "rewards/kidney_reward/mean": 2.308594226837158, "rewards/kidney_reward/std": 0.9246311187744141, "rewards/length2tails_reward/mean": 0.6851747035980225, "rewards/length2tails_reward/std": 0.3613194525241852, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.902003765106201, "rewards/thermo_reward/std": 2.1534359455108643, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0841665007174015, "epoch": 2.656, "frac_reward_zero_std": 0.0, "grad_norm": 0.0726706013083458, "learning_rate": 9.333706972625954e-07, "loss": -0.0025, "num_tokens": 11579231.0, "reward": 13.50656509399414, "reward_std": 1.1999610662460327, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7119618654251099, "rewards/length2tails_reward/std": 0.3201157748699188, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.429781913757324, "rewards/thermo_reward/std": 0.9979439377784729, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09338252898305655, "epoch": 2.658, "frac_reward_zero_std": 0.0, "grad_norm": 0.14197185635566711, "learning_rate": 9.320913191650723e-07, "loss": -0.0002, "num_tokens": 11587989.0, "reward": 13.265280723571777, "reward_std": 2.2200560569763184, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.487063407897949, "rewards/kidney_reward/std": 0.6476423740386963, "rewards/length2tails_reward/mean": 0.7995427846908569, "rewards/length2tails_reward/std": 0.24626636505126953, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2945876121520996, "rewards/thermo_reward/std": 1.3599261045455933, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.9375, "completions/mean_terminated_length": 272.9375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09512456692755222, "epoch": 2.66, "frac_reward_zero_std": 0.0, "grad_norm": 0.08194391429424286, "learning_rate": 9.308120527263116e-07, "loss": -0.0042, "num_tokens": 11596755.0, "reward": 13.631172180175781, "reward_std": 0.9415711760520935, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8064660429954529, "rewards/length2tails_reward/std": 0.2341540902853012, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4902195930480957, "rewards/thermo_reward/std": 0.8866589069366455, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.3125, "completions/mean_terminated_length": 271.3125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.09813211299479008, "epoch": 2.662, "frac_reward_zero_std": 0.0, "grad_norm": 0.10500352084636688, "learning_rate": 9.295329000497459e-07, "loss": -0.0067, "num_tokens": 11605469.0, "reward": 13.039316177368164, "reward_std": 2.732551336288452, "rewards/fitness_reward/mean": 7.0139055252075195, "rewards/fitness_reward/std": 1.9645084142684937, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8129358887672424, "rewards/length2tails_reward/std": 0.243895024061203, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.272355556488037, "rewards/thermo_reward/std": 1.4353864192962646, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.09504935797303915, "epoch": 2.664, "frac_reward_zero_std": 0.0, "grad_norm": 0.07366065680980682, "learning_rate": 9.282538632386206e-07, "loss": -0.0012, "num_tokens": 11614216.0, "reward": 13.701459884643555, "reward_std": 0.8884310722351074, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8486068844795227, "rewards/length2tails_reward/std": 0.20267438888549805, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.528935194015503, "rewards/thermo_reward/std": 0.8774805665016174, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0826009577140212, "epoch": 2.666, "frac_reward_zero_std": 0.0, "grad_norm": 0.08948934078216553, "learning_rate": 9.269749443959904e-07, "loss": -0.0028, "num_tokens": 11622963.0, "reward": 13.685850143432617, "reward_std": 1.211296796798706, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7733684182167053, "rewards/length2tails_reward/std": 0.2745286822319031, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5482077598571777, "rewards/thermo_reward/std": 1.1658411026000977, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.25, "completions/mean_terminated_length": 273.25, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09233411867171526, "epoch": 2.668, "frac_reward_zero_std": 0.0, "grad_norm": 0.1923236846923828, "learning_rate": 9.25696145624715e-07, "loss": 0.0022, "num_tokens": 11631739.0, "reward": 13.02658748626709, "reward_std": 5.040348529815674, "rewards/fitness_reward/mean": 6.9716691970825195, "rewards/fitness_reward/std": 2.2034339904785156, "rewards/kidney_reward/mean": 2.4078118801116943, "rewards/kidney_reward/std": 1.2369717359542847, "rewards/length2tails_reward/mean": 0.8197587728500366, "rewards/length2tails_reward/std": 0.24441765248775482, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4651312828063965, "rewards/thermo_reward/std": 1.6234033107757568, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09106593765318394, "epoch": 2.67, "frac_reward_zero_std": 0.0, "grad_norm": 0.05460730567574501, "learning_rate": 9.244174690274588e-07, "loss": -0.0042, "num_tokens": 11640487.0, "reward": 13.567729949951172, "reward_std": 1.3579232692718506, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8017369508743286, "rewards/length2tails_reward/std": 0.19972741603851318, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4272513389587402, "rewards/thermo_reward/std": 1.2037054300308228, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.90625, "completions/mean_terminated_length": 270.90625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.09610113222151995, "epoch": 2.672, "frac_reward_zero_std": 0.0, "grad_norm": 0.1169869601726532, "learning_rate": 9.231389167066835e-07, "loss": -0.0015, "num_tokens": 11649188.0, "reward": 13.5916109085083, "reward_std": 1.0232844352722168, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8028391003608704, "rewards/length2tails_reward/std": 0.2460554540157318, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4236621856689453, "rewards/thermo_reward/std": 1.0279661417007446, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.07874941732734442, "epoch": 2.674, "frac_reward_zero_std": 0.0, "grad_norm": 0.05606454983353615, "learning_rate": 9.218604907646474e-07, "loss": -0.0038, "num_tokens": 11657938.0, "reward": 13.528667449951172, "reward_std": 1.756035566329956, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.526620388031006, "rewards/kidney_reward/std": 0.5648899674415588, "rewards/length2tails_reward/mean": 0.7580454349517822, "rewards/length2tails_reward/std": 0.32083970308303833, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.465057611465454, "rewards/thermo_reward/std": 1.2058982849121094, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08984020911157131, "epoch": 2.676, "frac_reward_zero_std": 0.0, "grad_norm": 0.19920657575130463, "learning_rate": 9.205821933034011e-07, "loss": 0.0046, "num_tokens": 11666694.0, "reward": 13.52869987487793, "reward_std": 2.2076985836029053, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4739794731140137, "rewards/kidney_reward/std": 0.8626706004142761, "rewards/length2tails_reward/mean": 0.8132075071334839, "rewards/length2tails_reward/std": 0.2025136947631836, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5122148990631104, "rewards/thermo_reward/std": 1.3630452156066895, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.78125, "completions/mean_terminated_length": 273.78125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0930999293923378, "epoch": 2.678, "frac_reward_zero_std": 0.0, "grad_norm": 0.08068838715553284, "learning_rate": 9.193040264247828e-07, "loss": 0.0001, "num_tokens": 11675487.0, "reward": 13.657683372497559, "reward_std": 1.52280592918396, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8673219680786133, "rewards/length2tails_reward/std": 0.2095680683851242, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5106451511383057, "rewards/thermo_reward/std": 1.371688723564148, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09656371735036373, "epoch": 2.68, "frac_reward_zero_std": 0.0, "grad_norm": 1.1636841297149658, "learning_rate": 9.180259922304174e-07, "loss": 0.0087, "num_tokens": 11684218.0, "reward": 13.57703971862793, "reward_std": 1.2407475709915161, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4559011459350586, "rewards/kidney_reward/std": 0.9649369716644287, "rewards/length2tails_reward/mean": 0.7021640539169312, "rewards/length2tails_reward/std": 0.3510373830795288, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.9375, "completions/mean_terminated_length": 271.9375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.09234402794390917, "epoch": 2.682, "frac_reward_zero_std": 0.0, "grad_norm": 0.12080354243516922, "learning_rate": 9.167480928217107e-07, "loss": 0.0034, "num_tokens": 11692952.0, "reward": 13.760887145996094, "reward_std": 0.505234956741333, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8348555564880371, "rewards/length2tails_reward/std": 0.2311815470457077, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0820971867069602, "epoch": 2.684, "frac_reward_zero_std": 0.0, "grad_norm": 0.07293231785297394, "learning_rate": 9.154703302998472e-07, "loss": -0.0028, "num_tokens": 11701720.0, "reward": 13.877494812011719, "reward_std": 0.3817107677459717, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8043537139892578, "rewards/length2tails_reward/std": 0.22909440100193024, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.21875, "completions/mean_terminated_length": 271.21875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0913189435377717, "epoch": 2.686, "frac_reward_zero_std": 0.0, "grad_norm": 0.193898007273674, "learning_rate": 9.141927067657868e-07, "loss": 0.0014, "num_tokens": 11710431.0, "reward": 12.636367797851562, "reward_std": 4.728992462158203, "rewards/fitness_reward/mean": 6.928929328918457, "rewards/fitness_reward/std": 1.8890396356582642, "rewards/kidney_reward/mean": 2.3434696197509766, "rewards/kidney_reward/std": 1.1144951581954956, "rewards/length2tails_reward/mean": 0.720057487487793, "rewards/length2tails_reward/std": 0.2935502529144287, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1919631958007812, "rewards/thermo_reward/std": 1.8892871141433716, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08928275294601917, "epoch": 2.6879999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.0986161008477211, "learning_rate": 9.129152243202596e-07, "loss": 0.0014, "num_tokens": 11719196.0, "reward": 13.701171875, "reward_std": 1.056113839149475, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8297706842422485, "rewards/length2tails_reward/std": 0.20739682018756866, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.557888984680176, "rewards/thermo_reward/std": 0.9164990186691284, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09937832318246365, "epoch": 2.69, "frac_reward_zero_std": 0.0, "grad_norm": 0.11231338977813721, "learning_rate": 9.116378850637649e-07, "loss": 0.0024, "num_tokens": 11727966.0, "reward": 13.841484069824219, "reward_std": 0.42608216404914856, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8431087136268616, "rewards/length2tails_reward/std": 0.208415225148201, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0859296889975667, "epoch": 2.692, "frac_reward_zero_std": 0.0, "grad_norm": 0.13398477435112, "learning_rate": 9.103606910965665e-07, "loss": 0.0029, "num_tokens": 11736704.0, "reward": 13.708622932434082, "reward_std": 0.8716744184494019, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.736136794090271, "rewards/length2tails_reward/std": 0.29717540740966797, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5747036933898926, "rewards/thermo_reward/std": 0.8307410478591919, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09783033747226, "epoch": 2.694, "frac_reward_zero_std": 0.0, "grad_norm": 0.19264395534992218, "learning_rate": 9.090836445186883e-07, "loss": 0.0033, "num_tokens": 11745444.0, "reward": 12.873300552368164, "reward_std": 3.236984968185425, "rewards/fitness_reward/mean": 7.188657283782959, "rewards/fitness_reward/std": 0.7179933190345764, "rewards/kidney_reward/mean": 2.3760547637939453, "rewards/kidney_reward/std": 0.8919334411621094, "rewards/length2tails_reward/mean": 0.7774170637130737, "rewards/length2tails_reward/std": 0.2432018667459488, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1308465003967285, "rewards/thermo_reward/std": 1.732054591178894, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.5625, "completions/mean_terminated_length": 271.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09386403812095523, "epoch": 2.6959999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.1917669028043747, "learning_rate": 9.078067474299132e-07, "loss": 0.0051, "num_tokens": 11754166.0, "reward": 13.039304733276367, "reward_std": 2.959359884262085, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.452592670917511, "rewards/kidney_reward/mean": 2.3586478233337402, "rewards/kidney_reward/std": 1.0929458141326904, "rewards/length2tails_reward/mean": 0.7195242643356323, "rewards/length2tails_reward/std": 0.28090983629226685, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2625389099121094, "rewards/thermo_reward/std": 1.4743518829345703, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.53125, "completions/mean_terminated_length": 273.53125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08278051018714905, "epoch": 2.698, "frac_reward_zero_std": 0.0, "grad_norm": 0.08158128708600998, "learning_rate": 9.06530001929777e-07, "loss": -0.0031, "num_tokens": 11762951.0, "reward": 13.546849250793457, "reward_std": 1.4874722957611084, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5375123023986816, "rewards/kidney_reward/std": 0.503275454044342, "rewards/length2tails_reward/mean": 0.843279242515564, "rewards/length2tails_reward/std": 0.21693500876426697, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4638240337371826, "rewards/thermo_reward/std": 1.0164086818695068, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.5, "completions/mean_terminated_length": 273.5, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.08491826569661498, "epoch": 2.7, "frac_reward_zero_std": 0.0, "grad_norm": 0.07856491208076477, "learning_rate": 9.05253410117567e-07, "loss": -0.0062, "num_tokens": 11771735.0, "reward": 13.706490516662598, "reward_std": 0.8723586797714233, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8548135161399841, "rewards/length2tails_reward/std": 0.2584093511104584, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.533344268798828, "rewards/thermo_reward/std": 0.8557917475700378, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 273.21875, "completions/mean_terminated_length": 273.21875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08885801862925291, "epoch": 2.702, "frac_reward_zero_std": 0.0, "grad_norm": 0.10287737846374512, "learning_rate": 9.039769740923182e-07, "loss": 0.0056, "num_tokens": 11780510.0, "reward": 13.050555229187012, "reward_std": 4.520473957061768, "rewards/fitness_reward/mean": 7.021054267883301, "rewards/fitness_reward/std": 1.924071192741394, "rewards/kidney_reward/mean": 2.3942604064941406, "rewards/kidney_reward/std": 1.1641942262649536, "rewards/length2tails_reward/mean": 0.8088257908821106, "rewards/length2tails_reward/std": 0.22004175186157227, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4543581008911133, "rewards/thermo_reward/std": 1.4701238870620728, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.5, "completions/mean_terminated_length": 273.5, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09847811609506607, "epoch": 2.7039999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.23275437951087952, "learning_rate": 9.027006959528083e-07, "loss": 0.0021, "num_tokens": 11789294.0, "reward": 13.7213773727417, "reward_std": 0.5335816740989685, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8386157155036926, "rewards/length2tails_reward/std": 0.21023456752300262, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498504638671875, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.09279504232108593, "epoch": 2.706, "frac_reward_zero_std": 0.0, "grad_norm": 0.13962967693805695, "learning_rate": 9.014245777975564e-07, "loss": 0.007, "num_tokens": 11798050.0, "reward": 13.785286903381348, "reward_std": 0.5953236818313599, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8283153772354126, "rewards/length2tails_reward/std": 0.20525197684764862, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.9375, "completions/mean_terminated_length": 270.9375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08352559478953481, "epoch": 2.708, "frac_reward_zero_std": 0.0, "grad_norm": 0.1153552308678627, "learning_rate": 9.00148621724818e-07, "loss": -0.0036, "num_tokens": 11806752.0, "reward": 13.292638778686523, "reward_std": 3.030071973800659, "rewards/fitness_reward/mean": 7.052952766418457, "rewards/fitness_reward/std": 1.7436254024505615, "rewards/kidney_reward/mean": 2.5388498306274414, "rewards/kidney_reward/std": 0.49570968747138977, "rewards/length2tails_reward/mean": 0.6864029765129089, "rewards/length2tails_reward/std": 0.297585666179657, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.532196044921875, "rewards/thermo_reward/std": 0.8614235520362854, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.21875, "completions/mean_terminated_length": 273.21875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09010605234652758, "epoch": 2.71, "frac_reward_zero_std": 0.0, "grad_norm": 0.12815427780151367, "learning_rate": 8.988728298325821e-07, "loss": 0.0031, "num_tokens": 11815527.0, "reward": 13.77267074584961, "reward_std": 0.549005389213562, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8274263739585876, "rewards/length2tails_reward/std": 0.2377612441778183, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.34375, "completions/mean_terminated_length": 271.34375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0824267603456974, "epoch": 2.7119999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.08523330837488174, "learning_rate": 8.975972042185687e-07, "loss": -0.0042, "num_tokens": 11824242.0, "reward": 13.403091430664062, "reward_std": 1.459092378616333, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.510202169418335, "rewards/kidney_reward/std": 0.5214763283729553, "rewards/length2tails_reward/mean": 0.6743561029434204, "rewards/length2tails_reward/std": 0.30871862173080444, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4217782020568848, "rewards/thermo_reward/std": 1.0372663736343384, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.6875, "completions/mean_terminated_length": 271.6875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09593377634882927, "epoch": 2.714, "frac_reward_zero_std": 0.0, "grad_norm": 0.32700106501579285, "learning_rate": 8.963217469802226e-07, "loss": 0.005, "num_tokens": 11832968.0, "reward": 13.640189170837402, "reward_std": 1.546877145767212, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5230116844177246, "rewards/kidney_reward/std": 0.585303783416748, "rewards/length2tails_reward/mean": 0.744003176689148, "rewards/length2tails_reward/std": 0.2514055371284485, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.581591844558716, "rewards/thermo_reward/std": 0.9852604269981384, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.71875, "completions/mean_terminated_length": 271.71875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08936795685440302, "epoch": 2.716, "frac_reward_zero_std": 0.0, "grad_norm": 0.07124438136816025, "learning_rate": 8.950464602147132e-07, "loss": -0.0029, "num_tokens": 11841695.0, "reward": 13.338068008422852, "reward_std": 1.5099149942398071, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7638334631919861, "rewards/length2tails_reward/std": 0.24104620516300201, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2287392616271973, "rewards/thermo_reward/std": 1.305456519126892, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.03125, "completions/mean_terminated_length": 271.03125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09379763435572386, "epoch": 2.718, "frac_reward_zero_std": 0.0, "grad_norm": 0.13047315180301666, "learning_rate": 8.93771346018929e-07, "loss": 0.0011, "num_tokens": 11850400.0, "reward": 12.898988723754883, "reward_std": 3.223174571990967, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.3975424766540527, "rewards/kidney_reward/std": 0.9989736676216125, "rewards/length2tails_reward/mean": 0.7121099233627319, "rewards/length2tails_reward/std": 0.2626727223396301, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0840697288513184, "rewards/thermo_reward/std": 1.7830955982208252, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.1875, "completions/mean_terminated_length": 271.1875, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.07782580750063062, "epoch": 2.7199999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.10690188407897949, "learning_rate": 8.924964064894753e-07, "loss": 0.0012, "num_tokens": 11859110.0, "reward": 13.796202659606934, "reward_std": 0.47236502170562744, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7891558408737183, "rewards/length2tails_reward/std": 0.2034577876329422, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 271.46875, "completions/mean_terminated_length": 271.46875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.08494175039231777, "epoch": 2.722, "frac_reward_zero_std": 0.0, "grad_norm": 0.12061955779790878, "learning_rate": 8.912216437226691e-07, "loss": 0.0057, "num_tokens": 11867829.0, "reward": 13.42155647277832, "reward_std": 1.3494116067886353, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7937526106834412, "rewards/length2tails_reward/std": 0.21244557201862335, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.309235095977783, "rewards/thermo_reward/std": 1.156217098236084, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09255943074822426, "epoch": 2.724, "frac_reward_zero_std": 0.0, "grad_norm": 0.10911893844604492, "learning_rate": 8.899470598145384e-07, "loss": -0.0028, "num_tokens": 11876583.0, "reward": 13.61441421508789, "reward_std": 1.3774797916412354, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.806898832321167, "rewards/length2tails_reward/std": 0.25244489312171936, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4734182357788086, "rewards/thermo_reward/std": 1.2543762922286987, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09354557655751705, "epoch": 2.726, "frac_reward_zero_std": 0.0, "grad_norm": 0.12077096849679947, "learning_rate": 8.886726568608154e-07, "loss": 0.0025, "num_tokens": 11885341.0, "reward": 13.799593925476074, "reward_std": 0.46591177582740784, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8230618834495544, "rewards/length2tails_reward/std": 0.21043244004249573, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.84375, "completions/mean_terminated_length": 272.84375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08867257367819548, "epoch": 2.7279999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.07082948088645935, "learning_rate": 8.873984369569358e-07, "loss": -0.0058, "num_tokens": 11894104.0, "reward": 13.649602890014648, "reward_std": 1.1290522813796997, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7938203811645508, "rewards/length2tails_reward/std": 0.2440522462129593, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5099151134490967, "rewards/thermo_reward/std": 0.9728322625160217, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08561956882476807, "epoch": 2.73, "frac_reward_zero_std": 0.0, "grad_norm": 0.09584212303161621, "learning_rate": 8.861244021980343e-07, "loss": -0.0029, "num_tokens": 11902832.0, "reward": 13.62385368347168, "reward_std": 1.2382583618164062, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7532392740249634, "rewards/length2tails_reward/std": 0.28448599576950073, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4882240295410156, "rewards/thermo_reward/std": 1.0843724012374878, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09875656943768263, "epoch": 2.732, "frac_reward_zero_std": 0.0, "grad_norm": 0.08536585420370102, "learning_rate": 8.848505546789406e-07, "loss": -0.0008, "num_tokens": 11911592.0, "reward": 13.630233764648438, "reward_std": 1.0378588438034058, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8287537097930908, "rewards/length2tails_reward/std": 0.1734320968389511, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4870524406433105, "rewards/thermo_reward/std": 0.9019082188606262, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09804850351065397, "epoch": 2.734, "frac_reward_zero_std": 0.0, "grad_norm": 0.1275150179862976, "learning_rate": 8.835768964941772e-07, "loss": -0.0042, "num_tokens": 11920323.0, "reward": 13.905082702636719, "reward_std": 0.3192027807235718, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.6813804507255554, "rewards/length2tails_reward/std": 0.2995351254940033, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08263131324201822, "epoch": 2.7359999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.09288492053747177, "learning_rate": 8.823034297379546e-07, "loss": -0.0018, "num_tokens": 11929052.0, "reward": 13.093076705932617, "reward_std": 3.034579038619995, "rewards/fitness_reward/mean": 6.9872846603393555, "rewards/fitness_reward/std": 2.1151013374328613, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7136950492858887, "rewards/length2tails_reward/std": 0.33012154698371887, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3353025913238525, "rewards/thermo_reward/std": 0.9062677621841431, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.9375, "completions/mean_terminated_length": 271.9375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08855268033221364, "epoch": 2.738, "frac_reward_zero_std": 0.0, "grad_norm": 0.09174522757530212, "learning_rate": 8.810301565041691e-07, "loss": -0.0028, "num_tokens": 11937786.0, "reward": 13.751238822937012, "reward_std": 0.509857177734375, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7383784055709839, "rewards/length2tails_reward/std": 0.29278847575187683, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.90625, "completions/mean_terminated_length": 270.90625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.09602725226432085, "epoch": 2.74, "frac_reward_zero_std": 0.0, "grad_norm": 0.1331060528755188, "learning_rate": 8.797570788863988e-07, "loss": -0.002, "num_tokens": 11946487.0, "reward": 13.524124145507812, "reward_std": 1.7451503276824951, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.510956287384033, "rewards/kidney_reward/std": 0.5174046158790588, "rewards/length2tails_reward/mean": 0.7649425864219666, "rewards/length2tails_reward/std": 0.2521741986274719, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.475489616394043, "rewards/thermo_reward/std": 1.2552281618118286, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09053944982588291, "epoch": 2.742, "frac_reward_zero_std": 0.0, "grad_norm": 0.10725681483745575, "learning_rate": 8.784841989778996e-07, "loss": 0.0016, "num_tokens": 11955199.0, "reward": 13.909860610961914, "reward_std": 0.31133556365966797, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7291494011878967, "rewards/length2tails_reward/std": 0.26707524061203003, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08970591751858592, "epoch": 2.7439999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.10110809653997421, "learning_rate": 8.772115188716032e-07, "loss": 0.0012, "num_tokens": 11963915.0, "reward": 13.511757850646973, "reward_std": 1.1564346551895142, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.6929255723953247, "rewards/length2tails_reward/std": 0.3160499632358551, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.436877965927124, "rewards/thermo_reward/std": 0.9635177850723267, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08349447511136532, "epoch": 2.746, "frac_reward_zero_std": 0.0, "grad_norm": 0.08993352204561234, "learning_rate": 8.759390406601115e-07, "loss": -0.0005, "num_tokens": 11972671.0, "reward": 13.653955459594727, "reward_std": 0.9975864887237549, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.799585223197937, "rewards/length2tails_reward/std": 0.20988598465919495, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.513690710067749, "rewards/thermo_reward/std": 0.9536970257759094, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.90625, "completions/mean_terminated_length": 273.90625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09851919766515493, "epoch": 2.748, "frac_reward_zero_std": 0.0, "grad_norm": 0.14624251425266266, "learning_rate": 8.746667664356955e-07, "loss": 0.0009, "num_tokens": 11981468.0, "reward": 13.551703453063965, "reward_std": 1.1514352560043335, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8635172843933105, "rewards/length2tails_reward/std": 0.1821584850549698, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4324052333831787, "rewards/thermo_reward/std": 0.9851658940315247, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.08834454417228699, "epoch": 2.75, "frac_reward_zero_std": 0.0, "grad_norm": 0.1254044473171234, "learning_rate": 8.733946982902911e-07, "loss": 0.004, "num_tokens": 11990197.0, "reward": 13.839938163757324, "reward_std": 0.4260871112346649, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8276517987251282, "rewards/length2tails_reward/std": 0.20928896963596344, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08731277380138636, "epoch": 2.752, "frac_reward_zero_std": 0.0, "grad_norm": 0.13713133335113525, "learning_rate": 8.721228383154939e-07, "loss": 0.0008, "num_tokens": 11998932.0, "reward": 13.650564193725586, "reward_std": 0.9174665212631226, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7925777435302734, "rewards/length2tails_reward/std": 0.23623481392860413, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4836416244506836, "rewards/thermo_reward/std": 0.9184372425079346, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08832689933478832, "epoch": 2.754, "frac_reward_zero_std": 0.0, "grad_norm": 0.0689794272184372, "learning_rate": 8.708511886025583e-07, "loss": -0.0055, "num_tokens": 12007696.0, "reward": 13.400384902954102, "reward_std": 2.495495557785034, "rewards/fitness_reward/mean": 7.051910400390625, "rewards/fitness_reward/std": 1.7495219707489014, "rewards/kidney_reward/mean": 2.5385398864746094, "rewards/kidney_reward/std": 0.49746304750442505, "rewards/length2tails_reward/mean": 0.8031256198883057, "rewards/length2tails_reward/std": 0.25186508893966675, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08293364942073822, "epoch": 2.7560000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.12998053431510925, "learning_rate": 8.695797512423931e-07, "loss": 0.0027, "num_tokens": 12016434.0, "reward": 13.634329795837402, "reward_std": 0.5833243727684021, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7658626437187195, "rewards/length2tails_reward/std": 0.2717556953430176, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.470078706741333, "rewards/thermo_reward/std": 0.5830413699150085, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.086912976577878, "epoch": 2.758, "frac_reward_zero_std": 0.0, "grad_norm": 0.06801780313253403, "learning_rate": 8.683085283255576e-07, "loss": -0.0011, "num_tokens": 12025163.0, "reward": 13.779071807861328, "reward_std": 0.845180094242096, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8288226127624512, "rewards/length2tails_reward/std": 0.22464440762996674, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6085243225097656, "rewards/thermo_reward/std": 0.8422261476516724, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 270.90625, "completions/mean_terminated_length": 270.90625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.0917194364592433, "epoch": 2.76, "frac_reward_zero_std": 0.0, "grad_norm": 0.16181787848472595, "learning_rate": 8.670375219422577e-07, "loss": -0.005, "num_tokens": 12033864.0, "reward": 12.928831100463867, "reward_std": 2.7798335552215576, "rewards/fitness_reward/mean": 6.994358062744141, "rewards/fitness_reward/std": 1.7694021463394165, "rewards/kidney_reward/mean": 2.5039236545562744, "rewards/kidney_reward/std": 0.5554940104484558, "rewards/length2tails_reward/mean": 0.7913016080856323, "rewards/length2tails_reward/std": 0.2935396432876587, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.251418352127075, "rewards/thermo_reward/std": 1.1121165752410889, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.089608708396554, "epoch": 2.762, "frac_reward_zero_std": 0.0, "grad_norm": 0.15328380465507507, "learning_rate": 8.657667341823448e-07, "loss": 0.0052, "num_tokens": 12042585.0, "reward": 13.086148262023926, "reward_std": 2.592722177505493, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3998496532440186, "rewards/kidney_reward/std": 0.796771228313446, "rewards/length2tails_reward/mean": 0.7404778599739075, "rewards/length2tails_reward/std": 0.28793251514434814, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2085750102996826, "rewards/thermo_reward/std": 1.6067379713058472, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 754.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 285.6875, "completions/mean_terminated_length": 270.58062744140625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.0945020318031311, "epoch": 2.7640000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.5476410984992981, "learning_rate": 8.644961671353095e-07, "loss": -0.0133, "num_tokens": 12051759.0, "reward": 13.800397872924805, "reward_std": 0.882570743560791, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7528252601623535, "rewards/length2tails_reward/std": 0.2854486107826233, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6374502182006836, "rewards/thermo_reward/std": 0.8803324103355408, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09110002871602774, "epoch": 2.766, "frac_reward_zero_std": 0.0, "grad_norm": 0.09588117897510529, "learning_rate": 8.632258228902804e-07, "loss": -0.0019, "num_tokens": 12060504.0, "reward": 13.497045516967773, "reward_std": 1.180594801902771, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7557010054588318, "rewards/length2tails_reward/std": 0.24613431096076965, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3885293006896973, "rewards/thermo_reward/std": 1.0135005712509155, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.40625, "completions/mean_terminated_length": 273.40625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.09245431423187256, "epoch": 2.768, "frac_reward_zero_std": 0.0, "grad_norm": 0.4146077036857605, "learning_rate": 8.619557035360195e-07, "loss": -0.0036, "num_tokens": 12069285.0, "reward": 13.30376148223877, "reward_std": 2.591141939163208, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4650182723999023, "rewards/kidney_reward/std": 0.7693151831626892, "rewards/length2tails_reward/mean": 0.845126748085022, "rewards/length2tails_reward/std": 0.19638904929161072, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3505544662475586, "rewards/thermo_reward/std": 1.5948100090026855, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08536792267113924, "epoch": 2.77, "frac_reward_zero_std": 0.0, "grad_norm": 0.10885151475667953, "learning_rate": 8.606858111609187e-07, "loss": -0.0003, "num_tokens": 12078014.0, "reward": 13.27672004699707, "reward_std": 2.586475133895874, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4506547451019287, "rewards/kidney_reward/std": 0.8490661382675171, "rewards/length2tails_reward/mean": 0.7651956677436829, "rewards/length2tails_reward/std": 0.270557165145874, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2883613109588623, "rewards/thermo_reward/std": 1.8263949155807495, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 274.4375, "completions/mean_terminated_length": 274.4375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08724306477233768, "epoch": 2.7720000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.08859222382307053, "learning_rate": 8.594161478529973e-07, "loss": -0.007, "num_tokens": 12086828.0, "reward": 13.583634376525879, "reward_std": 1.3034932613372803, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8490362167358398, "rewards/length2tails_reward/std": 0.22278374433517456, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4110660552978516, "rewards/thermo_reward/std": 1.2885096073150635, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.40625, "completions/mean_terminated_length": 272.40625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08765661623328924, "epoch": 2.774, "frac_reward_zero_std": 0.0, "grad_norm": 0.46068093180656433, "learning_rate": 8.58146715699897e-07, "loss": 0.0018, "num_tokens": 12095577.0, "reward": 11.891407012939453, "reward_std": 6.213120460510254, "rewards/fitness_reward/mean": 6.643294334411621, "rewards/fitness_reward/std": 2.8548996448516846, "rewards/kidney_reward/mean": 2.1450510025024414, "rewards/kidney_reward/std": 1.608840823173523, "rewards/length2tails_reward/mean": 0.7848142981529236, "rewards/length2tails_reward/std": 0.25247421860694885, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.9245800971984863, "rewards/thermo_reward/std": 2.1157755851745605, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.09168736450374126, "epoch": 2.776, "frac_reward_zero_std": 0.0, "grad_norm": 0.10756999254226685, "learning_rate": 8.568775167888805e-07, "loss": -0.0002, "num_tokens": 12104306.0, "reward": 13.713671684265137, "reward_std": 0.965480625629425, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7663874626159668, "rewards/length2tails_reward/std": 0.2602115273475647, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.549368143081665, "rewards/thermo_reward/std": 0.9606295228004456, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.90625, "completions/mean_terminated_length": 271.90625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09946115221828222, "epoch": 2.778, "frac_reward_zero_std": 0.0, "grad_norm": 0.05923334136605263, "learning_rate": 8.556085532068266e-07, "loss": -0.005, "num_tokens": 12113039.0, "reward": 13.344707489013672, "reward_std": 3.0077295303344727, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.5390896797180176, "rewards/kidney_reward/std": 0.49435171484947205, "rewards/length2tails_reward/mean": 0.7656229138374329, "rewards/length2tails_reward/std": 0.27780574560165405, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5760021209716797, "rewards/thermo_reward/std": 0.8242037892341614, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08487782347947359, "epoch": 2.7800000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.1074460819363594, "learning_rate": 8.543398270402264e-07, "loss": -0.0027, "num_tokens": 12121804.0, "reward": 13.341554641723633, "reward_std": 1.6601860523223877, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7996214032173157, "rewards/length2tails_reward/std": 0.27192988991737366, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2286460399627686, "rewards/thermo_reward/std": 1.4481639862060547, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08386108372360468, "epoch": 2.782, "frac_reward_zero_std": 0.0, "grad_norm": 0.10229164361953735, "learning_rate": 8.530713403751821e-07, "loss": 0.0001, "num_tokens": 12130569.0, "reward": 13.57174015045166, "reward_std": 0.6612291932106018, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.812415361404419, "rewards/length2tails_reward/std": 0.2513531446456909, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4301929473876953, "rewards/thermo_reward/std": 0.6010707020759583, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.21875, "completions/mean_terminated_length": 271.21875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.08745740633457899, "epoch": 2.784, "frac_reward_zero_std": 0.0, "grad_norm": 0.15082944929599762, "learning_rate": 8.518030952974009e-07, "loss": 0.0019, "num_tokens": 12139280.0, "reward": 13.673470497131348, "reward_std": 0.9771033525466919, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7747855186462402, "rewards/length2tails_reward/std": 0.25838014483451843, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5356860160827637, "rewards/thermo_reward/std": 0.8443465232849121, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "entropy": 0.0878392974846065, "epoch": 2.786, "frac_reward_zero_std": 0.0, "grad_norm": 0.0767536535859108, "learning_rate": 8.50535093892193e-07, "loss": 0.0034, "num_tokens": 12148018.0, "reward": 13.795819282531738, "reward_std": 0.46981340646743774, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7853190898895264, "rewards/length2tails_reward/std": 0.24550072848796844, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09728020057082176, "epoch": 2.7880000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.10178124159574509, "learning_rate": 8.492673382444686e-07, "loss": -0.0005, "num_tokens": 12156763.0, "reward": 13.9190673828125, "reward_std": 0.3134841024875641, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8212195634841919, "rewards/length2tails_reward/std": 0.1988692432641983, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 271.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08629786409437656, "epoch": 2.79, "frac_reward_zero_std": 0.0, "grad_norm": 0.058435454964637756, "learning_rate": 8.479998304387328e-07, "loss": -0.005, "num_tokens": 12165486.0, "reward": 13.33970832824707, "reward_std": 3.007207155227661, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.5390896797180176, "rewards/kidney_reward/std": 0.49435171484947205, "rewards/length2tails_reward/mean": 0.7156283855438232, "rewards/length2tails_reward/std": 0.30288150906562805, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5760021209716797, "rewards/thermo_reward/std": 0.8242037892341614, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.09454571455717087, "epoch": 2.792, "frac_reward_zero_std": 0.0, "grad_norm": 0.09265205264091492, "learning_rate": 8.46732572559084e-07, "loss": -0.0003, "num_tokens": 12174150.0, "reward": 13.47976303100586, "reward_std": 1.12965989112854, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7220099568367004, "rewards/length2tails_reward/std": 0.24405024945735931, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.319897174835205, "rewards/thermo_reward/std": 1.1181665658950806, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 284.15625, "completions/mean_terminated_length": 284.15625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.11376719083636999, "epoch": 2.794, "frac_reward_zero_std": 0.0, "grad_norm": 0.5755971074104309, "learning_rate": 8.454655666892094e-07, "loss": -0.0072, "num_tokens": 12183275.0, "reward": 13.46137523651123, "reward_std": 1.9065113067626953, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.520040273666382, "rewards/kidney_reward/std": 0.6021116971969604, "rewards/length2tails_reward/mean": 0.7784010171890259, "rewards/length2tails_reward/std": 0.28705984354019165, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4598193168640137, "rewards/thermo_reward/std": 1.0365558862686157, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.08584170322865248, "epoch": 2.7960000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.0858711525797844, "learning_rate": 8.441988149123817e-07, "loss": 0.0018, "num_tokens": 12191995.0, "reward": 13.758430480957031, "reward_std": 1.163153886795044, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8490825891494751, "rewards/length2tails_reward/std": 0.18313346803188324, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6132168769836426, "rewards/thermo_reward/std": 1.013451099395752, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.03125, "completions/mean_terminated_length": 273.03125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08808640111237764, "epoch": 2.798, "frac_reward_zero_std": 0.0, "grad_norm": 0.05051012709736824, "learning_rate": 8.42932319311456e-07, "loss": -0.0034, "num_tokens": 12200764.0, "reward": 13.767698287963867, "reward_std": 0.9061033725738525, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8230335712432861, "rewards/length2tails_reward/std": 0.24817439913749695, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5977301597595215, "rewards/thermo_reward/std": 0.8991841077804565, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.53125, "completions/mean_terminated_length": 272.53125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09016821812838316, "epoch": 2.8, "frac_reward_zero_std": 0.0, "grad_norm": 0.08677458018064499, "learning_rate": 8.416660819688658e-07, "loss": -0.0029, "num_tokens": 12209517.0, "reward": 13.91813850402832, "reward_std": 0.3208127021789551, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8119388818740845, "rewards/length2tails_reward/std": 0.25621432065963745, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08616777881979942, "epoch": 2.802, "frac_reward_zero_std": 0.0, "grad_norm": 0.12825022637844086, "learning_rate": 8.40400104966621e-07, "loss": 0.0059, "num_tokens": 12218297.0, "reward": 13.762215614318848, "reward_std": 0.5018707513809204, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8481432795524597, "rewards/length2tails_reward/std": 0.17002004384994507, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.90625, "completions/mean_terminated_length": 271.90625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.08310704492032528, "epoch": 2.8040000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.06590055674314499, "learning_rate": 8.391343903863017e-07, "loss": -0.0002, "num_tokens": 12227030.0, "reward": 13.777181625366211, "reward_std": 1.0704293251037598, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8627910614013672, "rewards/length2tails_reward/std": 0.1706744283437729, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6305975914001465, "rewards/thermo_reward/std": 0.9178563356399536, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.625, "completions/mean_terminated_length": 270.625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "entropy": 0.08667949680238962, "epoch": 2.806, "frac_reward_zero_std": 0.0, "grad_norm": 0.24901294708251953, "learning_rate": 8.378689403090582e-07, "loss": 0.0021, "num_tokens": 12235722.0, "reward": 13.747515678405762, "reward_std": 0.4992513060569763, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7011414766311646, "rewards/length2tails_reward/std": 0.25532442331314087, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.53125, "completions/mean_terminated_length": 272.53125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07546252477914095, "epoch": 2.808, "frac_reward_zero_std": 0.0, "grad_norm": 0.1531989723443985, "learning_rate": 8.366037568156047e-07, "loss": -0.0059, "num_tokens": 12244475.0, "reward": 13.314666748046875, "reward_std": 2.8027918338775635, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4331979751586914, "rewards/kidney_reward/std": 0.9463348984718323, "rewards/length2tails_reward/mean": 0.7847499251365662, "rewards/length2tails_reward/std": 0.23787519335746765, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3993189334869385, "rewards/thermo_reward/std": 1.6910643577575684, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09690284915268421, "epoch": 2.81, "frac_reward_zero_std": 0.0, "grad_norm": 0.12429013848304749, "learning_rate": 8.353388419862178e-07, "loss": -0.0065, "num_tokens": 12253231.0, "reward": 13.412312507629395, "reward_std": 1.657273530960083, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8280727863311768, "rewards/length2tails_reward/std": 0.23345917463302612, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.354067802429199, "rewards/thermo_reward/std": 1.2732762098312378, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.65625, "completions/mean_terminated_length": 270.65625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.08442273642867804, "epoch": 2.8120000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.09026562422513962, "learning_rate": 8.340741979007324e-07, "loss": 0.0047, "num_tokens": 12261924.0, "reward": 13.80896282196045, "reward_std": 0.5169821977615356, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7914867401123047, "rewards/length2tails_reward/std": 0.21802614629268646, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.09375, "completions/mean_terminated_length": 272.09375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.08891510032117367, "epoch": 2.814, "frac_reward_zero_std": 0.0, "grad_norm": 0.09019751101732254, "learning_rate": 8.328098266385373e-07, "loss": 0.0019, "num_tokens": 12270663.0, "reward": 13.716066360473633, "reward_std": 1.1704113483428955, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8142136335372925, "rewards/length2tails_reward/std": 0.22736941277980804, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.546980381011963, "rewards/thermo_reward/std": 1.1725293397903442, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 274.21875, "completions/mean_terminated_length": 274.21875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09006855543702841, "epoch": 2.816, "frac_reward_zero_std": 0.0, "grad_norm": 0.08318524062633514, "learning_rate": 8.315457302785742e-07, "loss": -0.0012, "num_tokens": 12279470.0, "reward": 13.925493240356445, "reward_std": 0.313385009765625, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8854849934577942, "rewards/length2tails_reward/std": 0.14626377820968628, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09508083388209343, "epoch": 2.818, "frac_reward_zero_std": 0.0, "grad_norm": 0.06987155228853226, "learning_rate": 8.302819108993311e-07, "loss": -0.0013, "num_tokens": 12288250.0, "reward": 13.839058876037598, "reward_std": 0.4329962432384491, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8188518285751343, "rewards/length2tails_reward/std": 0.2515736520290375, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08047976158559322, "epoch": 2.82, "frac_reward_zero_std": 0.0, "grad_norm": 0.10217437148094177, "learning_rate": 8.290183705788418e-07, "loss": -0.0044, "num_tokens": 12296987.0, "reward": 13.645854949951172, "reward_std": 1.115564227104187, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.539055347442627, "rewards/kidney_reward/std": 0.4945458173751831, "rewards/length2tails_reward/mean": 0.7350102663040161, "rewards/length2tails_reward/std": 0.2945704162120819, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.09375, "completions/mean_terminated_length": 271.09375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.08697906043380499, "epoch": 2.822, "frac_reward_zero_std": 0.0, "grad_norm": 0.09989477694034576, "learning_rate": 8.277551113946811e-07, "loss": 0.0034, "num_tokens": 12305694.0, "reward": 13.754642486572266, "reward_std": 0.5036185383796692, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7724176645278931, "rewards/length2tails_reward/std": 0.25681835412979126, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08093288633972406, "epoch": 2.824, "frac_reward_zero_std": 0.0, "grad_norm": 0.0949145033955574, "learning_rate": 8.264921354239608e-07, "loss": 0.0011, "num_tokens": 12314441.0, "reward": 13.752120971679688, "reward_std": 0.5054385662078857, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7471965551376343, "rewards/length2tails_reward/std": 0.2987217903137207, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.9375, "completions/mean_terminated_length": 271.9375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08343170490115881, "epoch": 2.826, "frac_reward_zero_std": 0.0, "grad_norm": 0.061486851423978806, "learning_rate": 8.252294447433282e-07, "loss": -0.0049, "num_tokens": 12323175.0, "reward": 13.275848388671875, "reward_std": 3.016287326812744, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.511730670928955, "rewards/kidney_reward/std": 0.5132253766059875, "rewards/length2tails_reward/mean": 0.7494857311248779, "rewards/length2tails_reward/std": 0.27652037143707275, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.536116123199463, "rewards/thermo_reward/std": 0.8422485589981079, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08979653380811214, "epoch": 2.828, "frac_reward_zero_std": 0.0, "grad_norm": 0.1077917143702507, "learning_rate": 8.239670414289602e-07, "loss": -0.0046, "num_tokens": 12331955.0, "reward": 13.600502967834473, "reward_std": 1.2754502296447754, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7997991442680359, "rewards/length2tails_reward/std": 0.27430152893066406, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.487576723098755, "rewards/thermo_reward/std": 1.0877385139465332, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.15625, "completions/mean_terminated_length": 272.15625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08586056530475616, "epoch": 2.83, "frac_reward_zero_std": 0.0, "grad_norm": 0.11145995557308197, "learning_rate": 8.227049275565622e-07, "loss": -0.002, "num_tokens": 12340696.0, "reward": 13.244773864746094, "reward_std": 2.0113065242767334, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4455184936523438, "rewards/kidney_reward/std": 0.7122268080711365, "rewards/length2tails_reward/mean": 0.7700451612472534, "rewards/length2tails_reward/std": 0.2829984128475189, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2610652446746826, "rewards/thermo_reward/std": 1.3290748596191406, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 287.21875, "completions/mean_terminated_length": 287.21875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09500201977789402, "epoch": 2.832, "frac_reward_zero_std": 0.0, "grad_norm": 0.7574090361595154, "learning_rate": 8.214431052013634e-07, "loss": -0.0228, "num_tokens": 12349919.0, "reward": 13.81132698059082, "reward_std": 0.5325340032577515, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8151376843452454, "rewards/length2tails_reward/std": 0.28029194474220276, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.40625, "completions/mean_terminated_length": 272.40625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08386566769331694, "epoch": 2.834, "frac_reward_zero_std": 0.0, "grad_norm": 0.10139825195074081, "learning_rate": 8.201815764381133e-07, "loss": -0.0038, "num_tokens": 12358668.0, "reward": 12.858728408813477, "reward_std": 3.645132541656494, "rewards/fitness_reward/mean": 7.188657760620117, "rewards/fitness_reward/std": 0.7179933190345764, "rewards/kidney_reward/mean": 2.335433006286621, "rewards/kidney_reward/std": 1.1454758644104004, "rewards/length2tails_reward/mean": 0.7713392972946167, "rewards/length2tails_reward/std": 0.2946726381778717, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.157503604888916, "rewards/thermo_reward/std": 1.9195927381515503, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.9375, "completions/mean_terminated_length": 272.9375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.08110349159687757, "epoch": 2.836, "frac_reward_zero_std": 0.0, "grad_norm": 0.07257678359746933, "learning_rate": 8.189203433410794e-07, "loss": 0.0014, "num_tokens": 12367434.0, "reward": 13.879064559936523, "reward_std": 0.38343676924705505, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8200562000274658, "rewards/length2tails_reward/std": 0.276693731546402, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.40625, "completions/mean_terminated_length": 271.40625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.07928761281073093, "epoch": 2.838, "frac_reward_zero_std": 0.0, "grad_norm": 0.0934724509716034, "learning_rate": 8.176594079840422e-07, "loss": -0.0064, "num_tokens": 12376151.0, "reward": 13.905878067016602, "reward_std": 0.3248470723628998, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.6893353462219238, "rewards/length2tails_reward/std": 0.30571213364601135, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.8125, "completions/mean_terminated_length": 272.8125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0859050927683711, "epoch": 2.84, "frac_reward_zero_std": 0.0, "grad_norm": 0.08270053565502167, "learning_rate": 8.163987724402934e-07, "loss": -0.0029, "num_tokens": 12384913.0, "reward": 13.734323501586914, "reward_std": 0.8570379614830017, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7574863433837891, "rewards/length2tails_reward/std": 0.3112337589263916, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5709095001220703, "rewards/thermo_reward/std": 0.8499181270599365, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 270.8125, "completions/mean_terminated_length": 270.8125, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.09258036874234676, "epoch": 2.842, "frac_reward_zero_std": 0.0, "grad_norm": 0.1705041527748108, "learning_rate": 8.151384387826313e-07, "loss": 0.0069, "num_tokens": 12393611.0, "reward": 13.437908172607422, "reward_std": 1.449847936630249, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8329051733016968, "rewards/length2tails_reward/std": 0.23473943769931793, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.321671962738037, "rewards/thermo_reward/std": 1.2519351243972778, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08808799367398024, "epoch": 2.844, "frac_reward_zero_std": 0.0, "grad_norm": 0.0893077403306961, "learning_rate": 8.138784090833577e-07, "loss": 0.008, "num_tokens": 12402365.0, "reward": 13.877182006835938, "reward_std": 0.37219929695129395, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8012359142303467, "rewards/length2tails_reward/std": 0.22292174398899078, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.08956156764179468, "epoch": 2.846, "frac_reward_zero_std": 0.0, "grad_norm": 0.10173401236534119, "learning_rate": 8.126186854142751e-07, "loss": 0.0061, "num_tokens": 12411097.0, "reward": 13.678201675415039, "reward_std": 0.5565140247344971, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8057146668434143, "rewards/length2tails_reward/std": 0.2487732470035553, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.50996470451355, "rewards/thermo_reward/std": 0.5615194439888, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.5625, "completions/mean_terminated_length": 271.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08605887554585934, "epoch": 2.848, "frac_reward_zero_std": 0.0, "grad_norm": 0.07781686633825302, "learning_rate": 8.11359269846683e-07, "loss": -0.0006, "num_tokens": 12419819.0, "reward": 13.564287185668945, "reward_std": 1.3377314805984497, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7410205602645874, "rewards/length2tails_reward/std": 0.24903921782970428, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4298787117004395, "rewards/thermo_reward/std": 1.1900341510772705, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.1875, "completions/mean_terminated_length": 273.1875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0928757581859827, "epoch": 2.85, "frac_reward_zero_std": 0.0, "grad_norm": 0.11290508508682251, "learning_rate": 8.101001644513731e-07, "loss": 0.0043, "num_tokens": 12428593.0, "reward": 13.877665519714355, "reward_std": 0.37265855073928833, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8060607314109802, "rewards/length2tails_reward/std": 0.24503538012504578, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 283.53125, "completions/mean_terminated_length": 283.53125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09894910361617804, "epoch": 2.852, "frac_reward_zero_std": 0.0, "grad_norm": 0.3717043995857239, "learning_rate": 8.088413712986279e-07, "loss": -0.0102, "num_tokens": 12437698.0, "reward": 13.021846771240234, "reward_std": 3.988328695297241, "rewards/fitness_reward/mean": 7.037449836730957, "rewards/fitness_reward/std": 1.8313246965408325, "rewards/kidney_reward/mean": 2.465315341949463, "rewards/kidney_reward/std": 0.7676700353622437, "rewards/length2tails_reward/mean": 0.7774168252944946, "rewards/length2tails_reward/std": 0.2898247241973877, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3413400650024414, "rewards/thermo_reward/std": 1.4518558979034424, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.21875, "completions/mean_terminated_length": 273.21875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0861760089173913, "epoch": 2.854, "frac_reward_zero_std": 0.0, "grad_norm": 0.10765953361988068, "learning_rate": 8.075828924582168e-07, "loss": 0.0008, "num_tokens": 12446473.0, "reward": 13.878617286682129, "reward_std": 0.3746810257434845, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8155795335769653, "rewards/length2tails_reward/std": 0.25068867206573486, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08908339031040668, "epoch": 2.856, "frac_reward_zero_std": 0.0, "grad_norm": 0.11010465025901794, "learning_rate": 8.063247299993918e-07, "loss": -0.0018, "num_tokens": 12455210.0, "reward": 13.054346084594727, "reward_std": 2.4582793712615967, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4899649620056152, "rewards/kidney_reward/std": 0.631715714931488, "rewards/length2tails_reward/mean": 0.777817964553833, "rewards/length2tails_reward/std": 0.24456126987934113, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.082923412322998, "rewards/thermo_reward/std": 1.7510353326797485, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10221673734486103, "epoch": 2.858, "frac_reward_zero_std": 0.0, "grad_norm": 0.10918369144201279, "learning_rate": 8.05066885990885e-07, "loss": -0.0024, "num_tokens": 12463948.0, "reward": 13.874887466430664, "reward_std": 0.38162562251091003, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7782790064811707, "rewards/length2tails_reward/std": 0.2557736933231354, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.90625, "completions/mean_terminated_length": 273.90625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07712917029857635, "epoch": 2.86, "frac_reward_zero_std": 0.0, "grad_norm": 0.08574642241001129, "learning_rate": 8.03809362500905e-07, "loss": 0.001, "num_tokens": 12472745.0, "reward": 13.801877975463867, "reward_std": 0.47052812576293945, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8459126353263855, "rewards/length2tails_reward/std": 0.2445998638868332, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 271.59375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0909727094694972, "epoch": 2.862, "frac_reward_zero_std": 0.0, "grad_norm": 0.10815263539552689, "learning_rate": 8.025521615971329e-07, "loss": 0.0008, "num_tokens": 12481468.0, "reward": 13.79372787475586, "reward_std": 0.47003355622291565, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7644029855728149, "rewards/length2tails_reward/std": 0.22795794904232025, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 284.90625, "completions/mean_terminated_length": 284.90625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09156226739287376, "epoch": 2.864, "frac_reward_zero_std": 0.0, "grad_norm": 0.36374738812446594, "learning_rate": 8.012952853467202e-07, "loss": -0.0119, "num_tokens": 12490617.0, "reward": 13.561015129089355, "reward_std": 1.168427586555481, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7868642807006836, "rewards/length2tails_reward/std": 0.2786427438259125, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.422022819519043, "rewards/thermo_reward/std": 1.0360568761825562, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.092946563847363, "epoch": 2.866, "frac_reward_zero_std": 0.0, "grad_norm": 0.2058488130569458, "learning_rate": 8.000387358162834e-07, "loss": 0.0008, "num_tokens": 12499377.0, "reward": 12.085588455200195, "reward_std": 4.778906345367432, "rewards/fitness_reward/mean": 6.6559247970581055, "rewards/fitness_reward/std": 2.7804884910583496, "rewards/kidney_reward/mean": 2.268216371536255, "rewards/kidney_reward/std": 1.1071895360946655, "rewards/length2tails_reward/mean": 0.7589725255966187, "rewards/length2tails_reward/std": 0.3145608901977539, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.98555064201355, "rewards/thermo_reward/std": 1.6683814525604248, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.25, "completions/mean_terminated_length": 273.25, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08142790850251913, "epoch": 2.868, "frac_reward_zero_std": 0.0, "grad_norm": 0.08400456607341766, "learning_rate": 7.98782515071903e-07, "loss": 0.001, "num_tokens": 12508153.0, "reward": 13.958139419555664, "reward_std": 0.22377903759479523, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8130799531936646, "rewards/length2tails_reward/std": 0.24826769530773163, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.34375, "completions/mean_terminated_length": 273.34375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08738233055919409, "epoch": 2.87, "frac_reward_zero_std": 0.0, "grad_norm": 0.10952440649271011, "learning_rate": 7.975266251791184e-07, "loss": -0.0004, "num_tokens": 12516932.0, "reward": 13.879385948181152, "reward_std": 0.3758961260318756, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8232645988464355, "rewards/length2tails_reward/std": 0.21714530885219574, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08811240177601576, "epoch": 2.872, "frac_reward_zero_std": 0.0, "grad_norm": 0.2943911850452423, "learning_rate": 7.962710682029244e-07, "loss": -0.0014, "num_tokens": 12525690.0, "reward": 13.523576736450195, "reward_std": 1.5703537464141846, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7819285988807678, "rewards/length2tails_reward/std": 0.25686317682266235, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3850791454315186, "rewards/thermo_reward/std": 1.4264148473739624, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.96875, "completions/mean_terminated_length": 273.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0862605981528759, "epoch": 2.874, "frac_reward_zero_std": 0.0, "grad_norm": 0.10683736950159073, "learning_rate": 7.950158462077697e-07, "loss": 0.0024, "num_tokens": 12534489.0, "reward": 13.596105575561523, "reward_std": 1.2210086584091187, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8528375625610352, "rewards/length2tails_reward/std": 0.22560538351535797, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.45051646232605, "rewards/thermo_reward/std": 1.083735466003418, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09131797589361668, "epoch": 2.876, "frac_reward_zero_std": 0.0, "grad_norm": 0.089107945561409, "learning_rate": 7.937609612575511e-07, "loss": 0.0047, "num_tokens": 12543220.0, "reward": 13.794271469116211, "reward_std": 0.4652538597583771, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7698436975479126, "rewards/length2tails_reward/std": 0.20634371042251587, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08963609486818314, "epoch": 2.878, "frac_reward_zero_std": 0.0, "grad_norm": 0.08300493657588959, "learning_rate": 7.925064154156114e-07, "loss": -0.0056, "num_tokens": 12551971.0, "reward": 13.066444396972656, "reward_std": 3.2766692638397217, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.511730670928955, "rewards/kidney_reward/std": 0.5132253766059875, "rewards/length2tails_reward/mean": 0.7815616130828857, "rewards/length2tails_reward/std": 0.26363322138786316, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.323504686355591, "rewards/thermo_reward/std": 1.4982414245605469, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.34375, "completions/mean_terminated_length": 273.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08487140201032162, "epoch": 2.88, "frac_reward_zero_std": 0.0, "grad_norm": 0.14700040221214294, "learning_rate": 7.912522107447366e-07, "loss": 0.0016, "num_tokens": 12560750.0, "reward": 13.799543380737305, "reward_std": 0.46718329191207886, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8225642442703247, "rewards/length2tails_reward/std": 0.2352154701948166, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6296226978302, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 271.59375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.08915096707642078, "epoch": 2.882, "frac_reward_zero_std": 0.0, "grad_norm": 0.12797556817531586, "learning_rate": 7.899983493071506e-07, "loss": 0.0027, "num_tokens": 12569473.0, "reward": 13.651243209838867, "reward_std": 0.6270330548286438, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8097268342971802, "rewards/length2tails_reward/std": 0.28132137656211853, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.50996470451355, "rewards/thermo_reward/std": 0.5615194439888, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.9375, "completions/mean_terminated_length": 272.9375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09121816977858543, "epoch": 2.884, "frac_reward_zero_std": 0.0, "grad_norm": 0.09454463422298431, "learning_rate": 7.887448331645137e-07, "loss": 0.001, "num_tokens": 12578239.0, "reward": 13.510807037353516, "reward_std": 1.0569266080856323, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8264129161834717, "rewards/length2tails_reward/std": 0.21036264300346375, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3678598403930664, "rewards/thermo_reward/std": 0.9316072463989258, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09095319919288158, "epoch": 2.886, "frac_reward_zero_std": 0.0, "grad_norm": 0.08146243542432785, "learning_rate": 7.874916643779184e-07, "loss": -0.0042, "num_tokens": 12586977.0, "reward": 12.570428848266602, "reward_std": 3.5493857860565186, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.420412063598633, "rewards/kidney_reward/std": 0.7131741642951965, "rewards/length2tails_reward/mean": 0.7441259622573853, "rewards/length2tails_reward/std": 0.3011590242385864, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.922551393508911, "rewards/thermo_reward/std": 1.802452802658081, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.09375, "completions/mean_terminated_length": 272.09375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0917142340913415, "epoch": 2.888, "frac_reward_zero_std": 0.0, "grad_norm": 0.0717688724398613, "learning_rate": 7.862388450078854e-07, "loss": 0.0002, "num_tokens": 12595716.0, "reward": 13.552705764770508, "reward_std": 1.637266993522644, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.534823417663574, "rewards/kidney_reward/std": 0.5184864401817322, "rewards/length2tails_reward/mean": 0.7994788885116577, "rewards/length2tails_reward/std": 0.19421547651290894, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4767494201660156, "rewards/thermo_reward/std": 1.2395474910736084, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.15625, "completions/mean_terminated_length": 271.15625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.08649328723549843, "epoch": 2.89, "frac_reward_zero_std": 0.0, "grad_norm": 0.09909166395664215, "learning_rate": 7.84986377114362e-07, "loss": -0.001, "num_tokens": 12604425.0, "reward": 13.83226203918457, "reward_std": 0.4334369897842407, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7508825063705444, "rewards/length2tails_reward/std": 0.30833303928375244, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08636385947465897, "epoch": 2.892, "frac_reward_zero_std": 0.0, "grad_norm": 0.10228565335273743, "learning_rate": 7.837342627567165e-07, "loss": -0.0023, "num_tokens": 12613141.0, "reward": 13.536503791809082, "reward_std": 1.9209715127944946, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.500931978225708, "rewards/kidney_reward/std": 0.7102046608924866, "rewards/length2tails_reward/mean": 0.7440629005432129, "rewards/length2tails_reward/std": 0.2628605365753174, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4999806880950928, "rewards/thermo_reward/std": 1.222486138343811, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09061552211642265, "epoch": 2.894, "frac_reward_zero_std": 0.0, "grad_norm": 0.5566885471343994, "learning_rate": 7.824825039937368e-07, "loss": -0.0042, "num_tokens": 12621879.0, "reward": 13.104745864868164, "reward_std": 2.5030510425567627, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4164295196533203, "rewards/kidney_reward/std": 0.8949340581893921, "rewards/length2tails_reward/mean": 0.7900067567825317, "rewards/length2tails_reward/std": 0.22052329778671265, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.205639362335205, "rewards/thermo_reward/std": 1.4034382104873657, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 271.46875, "completions/mean_terminated_length": 271.46875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08591014333069324, "epoch": 2.896, "frac_reward_zero_std": 0.0, "grad_norm": 0.06331221014261246, "learning_rate": 7.81231102883625e-07, "loss": -0.001, "num_tokens": 12630598.0, "reward": 12.890361785888672, "reward_std": 2.966064214706421, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.4184765815734863, "rewards/kidney_reward/std": 0.7438530325889587, "rewards/length2tails_reward/mean": 0.7856767177581787, "rewards/length2tails_reward/std": 0.23383821547031403, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.047151565551758, "rewards/thermo_reward/std": 1.8174599409103394, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08721327036619186, "epoch": 2.898, "frac_reward_zero_std": 0.0, "grad_norm": 0.11579962819814682, "learning_rate": 7.799800614839964e-07, "loss": -0.0061, "num_tokens": 12639335.0, "reward": 12.824150085449219, "reward_std": 3.6121666431427, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.3733737468719482, "rewards/kidney_reward/std": 0.9182524681091309, "rewards/length2tails_reward/mean": 0.740673840045929, "rewards/length2tails_reward/std": 0.28297778964042664, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2236557006835938, "rewards/thermo_reward/std": 1.5752650499343872, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10069273272529244, "epoch": 2.9, "frac_reward_zero_std": 0.0, "grad_norm": 0.08265271782875061, "learning_rate": 7.787293818518737e-07, "loss": -0.0047, "num_tokens": 12648086.0, "reward": 13.497522354125977, "reward_std": 1.9584171772003174, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5227279663085938, "rewards/kidney_reward/std": 0.5869088172912598, "rewards/length2tails_reward/mean": 0.8091145157814026, "rewards/length2tails_reward/std": 0.2377697378396988, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4326975345611572, "rewards/thermo_reward/std": 1.3785746097564697, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.092111611738801, "epoch": 2.902, "frac_reward_zero_std": 0.0, "grad_norm": 0.11661788821220398, "learning_rate": 7.774790660436857e-07, "loss": 0.0035, "num_tokens": 12656834.0, "reward": 13.799483299255371, "reward_std": 0.46884268522262573, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8219585418701172, "rewards/length2tails_reward/std": 0.24238049983978271, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.9375, "completions/mean_terminated_length": 271.9375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.08617793815210462, "epoch": 2.904, "frac_reward_zero_std": 0.0, "grad_norm": 0.10517249256372452, "learning_rate": 7.762291161152626e-07, "loss": 0.006, "num_tokens": 12665568.0, "reward": 13.836585998535156, "reward_std": 0.4221171736717224, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7941212058067322, "rewards/length2tails_reward/std": 0.2467154860496521, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09527615550905466, "epoch": 2.906, "frac_reward_zero_std": 0.0, "grad_norm": 0.0897793173789978, "learning_rate": 7.749795341218327e-07, "loss": -0.0031, "num_tokens": 12674315.0, "reward": 13.245508193969727, "reward_std": 2.678555727005005, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.462587356567383, "rewards/kidney_reward/std": 0.7827925682067871, "rewards/length2tails_reward/mean": 0.8029334545135498, "rewards/length2tails_reward/std": 0.25929903984069824, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.298952341079712, "rewards/thermo_reward/std": 1.6578506231307983, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.1875, "completions/mean_terminated_length": 273.1875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.09170580562204123, "epoch": 2.908, "frac_reward_zero_std": 0.0, "grad_norm": 0.17641600966453552, "learning_rate": 7.7373032211802e-07, "loss": 0.0043, "num_tokens": 12683089.0, "reward": 12.860458374023438, "reward_std": 4.126487731933594, "rewards/fitness_reward/mean": 7.037131309509277, "rewards/fitness_reward/std": 1.8331266641616821, "rewards/kidney_reward/mean": 2.3966851234436035, "rewards/kidney_reward/std": 0.8604899048805237, "rewards/length2tails_reward/mean": 0.854651153087616, "rewards/length2tails_reward/std": 0.21645672619342804, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.241177558898926, "rewards/thermo_reward/std": 1.6034048795700073, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09060567431151867, "epoch": 2.91, "frac_reward_zero_std": 0.0, "grad_norm": 0.19656902551651, "learning_rate": 7.724814821578395e-07, "loss": 0.001, "num_tokens": 12691834.0, "reward": 12.79423999786377, "reward_std": 4.301056861877441, "rewards/fitness_reward/mean": 7.017977237701416, "rewards/fitness_reward/std": 1.9414762258529663, "rewards/kidney_reward/mean": 2.345012664794922, "rewards/kidney_reward/std": 1.0418504476547241, "rewards/length2tails_reward/mean": 0.7869172096252441, "rewards/length2tails_reward/std": 0.2412094920873642, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2525582313537598, "rewards/thermo_reward/std": 1.6153535842895508, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0849033072590828, "epoch": 2.912, "frac_reward_zero_std": 0.0, "grad_norm": 0.07628481835126877, "learning_rate": 7.712330162946948e-07, "loss": 0.0008, "num_tokens": 12700577.0, "reward": 13.83536434173584, "reward_std": 0.4307062029838562, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7819132208824158, "rewards/length2tails_reward/std": 0.2246393859386444, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 271.90625, "completions/mean_terminated_length": 271.90625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09288029000163078, "epoch": 2.914, "frac_reward_zero_std": 0.0, "grad_norm": 0.08784768730401993, "learning_rate": 7.699849265813743e-07, "loss": -0.0026, "num_tokens": 12709310.0, "reward": 13.766115188598633, "reward_std": 0.5586167573928833, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7618662118911743, "rewards/length2tails_reward/std": 0.27308955788612366, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.875, "completions/mean_terminated_length": 269.875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.08489096444100142, "epoch": 2.916, "frac_reward_zero_std": 0.0, "grad_norm": 0.07928179949522018, "learning_rate": 7.687372150700479e-07, "loss": 0.001, "num_tokens": 12717978.0, "reward": 13.911661148071289, "reward_std": 0.3134597837924957, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7471531629562378, "rewards/length2tails_reward/std": 0.2685222327709198, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.09375, "completions/mean_terminated_length": 272.09375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.07988280057907104, "epoch": 2.918, "frac_reward_zero_std": 0.0, "grad_norm": 0.0837949886918068, "learning_rate": 7.674898838122638e-07, "loss": 0.0008, "num_tokens": 12726717.0, "reward": 13.914548873901367, "reward_std": 0.3166908025741577, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7760345339775085, "rewards/length2tails_reward/std": 0.21083956956863403, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.8125, "completions/mean_terminated_length": 273.8125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09522326570004225, "epoch": 2.92, "frac_reward_zero_std": 0.0, "grad_norm": 0.11612401902675629, "learning_rate": 7.662429348589446e-07, "loss": -0.0007, "num_tokens": 12735511.0, "reward": 13.207071304321289, "reward_std": 2.0308852195739746, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4523303508758545, "rewards/kidney_reward/std": 0.6995065212249756, "rewards/length2tails_reward/mean": 0.8716757297515869, "rewards/length2tails_reward/std": 0.21875134110450745, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.206387996673584, "rewards/thermo_reward/std": 1.3996530771255493, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 271.8125, "completions/mean_terminated_length": 271.8125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.085783283226192, "epoch": 2.922, "frac_reward_zero_std": 0.0, "grad_norm": 0.10536955296993256, "learning_rate": 7.649963702603848e-07, "loss": 0.0012, "num_tokens": 12744241.0, "reward": 13.83349609375, "reward_std": 0.429172158241272, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7632323503494263, "rewards/length2tails_reward/std": 0.25650447607040405, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.09125344175845385, "epoch": 2.924, "frac_reward_zero_std": 0.0, "grad_norm": 0.08487284928560257, "learning_rate": 7.637501920662468e-07, "loss": -0.0004, "num_tokens": 12752999.0, "reward": 13.840091705322266, "reward_std": 0.42784279584884644, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8291807174682617, "rewards/length2tails_reward/std": 0.18743930757045746, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09677456505596638, "epoch": 2.926, "frac_reward_zero_std": 0.0, "grad_norm": 0.09227211773395538, "learning_rate": 7.62504402325557e-07, "loss": -0.0005, "num_tokens": 12761764.0, "reward": 13.854175567626953, "reward_std": 0.4863269329071045, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8447530269622803, "rewards/length2tails_reward/std": 0.19483210146427155, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08659609872847795, "epoch": 2.928, "frac_reward_zero_std": 0.0, "grad_norm": 0.06487475335597992, "learning_rate": 7.612590030867039e-07, "loss": -0.0053, "num_tokens": 12770472.0, "reward": 13.283210754394531, "reward_std": 2.0315680503845215, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.504581928253174, "rewards/kidney_reward/std": 0.5519188642501831, "rewards/length2tails_reward/mean": 0.6866386532783508, "rewards/length2tails_reward/std": 0.31638386845588684, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2487802505493164, "rewards/thermo_reward/std": 1.5519336462020874, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.40625, "completions/mean_terminated_length": 272.40625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08706388808786869, "epoch": 2.93, "frac_reward_zero_std": 0.0, "grad_norm": 0.21007883548736572, "learning_rate": 7.60013996397434e-07, "loss": -0.0046, "num_tokens": 12779221.0, "reward": 13.285353660583496, "reward_std": 2.5396311283111572, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4273016452789307, "rewards/kidney_reward/std": 0.9792560338973999, "rewards/length2tails_reward/mean": 0.7961028218269348, "rewards/length2tails_reward/std": 0.23903968930244446, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.374765634536743, "rewards/thermo_reward/std": 1.2762339115142822, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08097695140168071, "epoch": 2.932, "frac_reward_zero_std": 0.0, "grad_norm": 0.08966746181249619, "learning_rate": 7.587693843048474e-07, "loss": 0.0017, "num_tokens": 12787961.0, "reward": 13.874987602233887, "reward_std": 0.3732488453388214, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7792847156524658, "rewards/length2tails_reward/std": 0.24024541676044464, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.34375, "completions/mean_terminated_length": 270.34375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.09143519587814808, "epoch": 2.934, "frac_reward_zero_std": 0.0, "grad_norm": 0.23096249997615814, "learning_rate": 7.575251688553963e-07, "loss": 0.0076, "num_tokens": 12796644.0, "reward": 13.87496566772461, "reward_std": 0.37357449531555176, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7790652513504028, "rewards/length2tails_reward/std": 0.3010183274745941, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.53125, "completions/mean_terminated_length": 272.53125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08806372992694378, "epoch": 2.936, "frac_reward_zero_std": 0.0, "grad_norm": 0.09937624633312225, "learning_rate": 7.5628135209488e-07, "loss": 0.0019, "num_tokens": 12805397.0, "reward": 13.958802223205566, "reward_std": 0.22333241999149323, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.81971675157547, "rewards/length2tails_reward/std": 0.19025535881519318, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.96875, "completions/mean_terminated_length": 273.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0997593542560935, "epoch": 2.9379999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.09114154428243637, "learning_rate": 7.550379360684434e-07, "loss": 0.0028, "num_tokens": 12814196.0, "reward": 13.84214973449707, "reward_std": 0.4247521460056305, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8497633934020996, "rewards/length2tails_reward/std": 0.2314579039812088, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.53125, "completions/mean_terminated_length": 270.53125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08079738728702068, "epoch": 2.94, "frac_reward_zero_std": 0.0, "grad_norm": 0.11101573705673218, "learning_rate": 7.537949228205708e-07, "loss": -0.0035, "num_tokens": 12822885.0, "reward": 13.57027816772461, "reward_std": 1.9160974025726318, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5229597091674805, "rewards/kidney_reward/std": 0.5855976939201355, "rewards/length2tails_reward/mean": 0.6812267303466797, "rewards/length2tails_reward/std": 0.30966007709503174, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5755205154418945, "rewards/thermo_reward/std": 1.0178793668746948, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08315122686326504, "epoch": 2.942, "frac_reward_zero_std": 0.0, "grad_norm": 0.1604270339012146, "learning_rate": 7.525523143950858e-07, "loss": 0.0001, "num_tokens": 12831630.0, "reward": 12.32827377319336, "reward_std": 6.565638065338135, "rewards/fitness_reward/mean": 6.622296333312988, "rewards/fitness_reward/std": 2.912938117980957, "rewards/kidney_reward/mean": 2.213806390762329, "rewards/kidney_reward/std": 1.6312694549560547, "rewards/length2tails_reward/mean": 0.7732738256454468, "rewards/length2tails_reward/std": 0.273733913898468, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3148441314697266, "rewards/thermo_reward/std": 2.026035785675049, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.08468564599752426, "epoch": 2.944, "frac_reward_zero_std": 0.0, "grad_norm": 0.07568921148777008, "learning_rate": 7.513101128351453e-07, "loss": -0.0028, "num_tokens": 12840378.0, "reward": 13.400533676147461, "reward_std": 2.264967203140259, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.509674072265625, "rewards/kidney_reward/std": 0.6607514023780823, "rewards/length2tails_reward/mean": 0.7946181297302246, "rewards/length2tails_reward/std": 0.21099165081977844, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.407722234725952, "rewards/thermo_reward/std": 1.3061423301696777, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.1875, "completions/mean_terminated_length": 271.1875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08182617742568254, "epoch": 2.9459999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.09725268185138702, "learning_rate": 7.500683201832382e-07, "loss": -0.0047, "num_tokens": 12849088.0, "reward": 13.759465217590332, "reward_std": 0.5583043098449707, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.6953686475753784, "rewards/length2tails_reward/std": 0.29025208950042725, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 274.15625, "completions/mean_terminated_length": 274.15625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.08809699164703488, "epoch": 2.948, "frac_reward_zero_std": 0.0, "grad_norm": 0.12728384137153625, "learning_rate": 7.488269384811799e-07, "loss": 0.0048, "num_tokens": 12857893.0, "reward": 13.71151351928711, "reward_std": 0.6438538432121277, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8883048892021179, "rewards/length2tails_reward/std": 0.11851312965154648, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897367000579834, "rewards/thermo_reward/std": 0.5061467885971069, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08566201105713844, "epoch": 2.95, "frac_reward_zero_std": 0.0, "grad_norm": 0.16148102283477783, "learning_rate": 7.475859697701109e-07, "loss": -0.0034, "num_tokens": 12866614.0, "reward": 13.455978393554688, "reward_std": 1.2935900688171387, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7256759405136108, "rewards/length2tails_reward/std": 0.26900699734687805, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.323105812072754, "rewards/thermo_reward/std": 1.1443455219268799, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.84375, "completions/mean_terminated_length": 272.84375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.09163531567901373, "epoch": 2.952, "frac_reward_zero_std": 0.0, "grad_norm": 0.10324077308177948, "learning_rate": 7.463454160904927e-07, "loss": -0.0018, "num_tokens": 12875377.0, "reward": 13.806316375732422, "reward_std": 0.47567877173423767, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8902928829193115, "rewards/length2tails_reward/std": 0.12258625030517578, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.09375, "completions/mean_terminated_length": 273.09375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.08791532320901752, "epoch": 2.9539999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.32206058502197266, "learning_rate": 7.451052794821039e-07, "loss": 0.0041, "num_tokens": 12884148.0, "reward": 13.934386253356934, "reward_std": 0.3780563771724701, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8491396903991699, "rewards/length2tails_reward/std": 0.1631694883108139, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08659681957215071, "epoch": 2.956, "frac_reward_zero_std": 0.0, "grad_norm": 0.07478148490190506, "learning_rate": 7.438655619840375e-07, "loss": 0.0005, "num_tokens": 12892928.0, "reward": 13.849531173706055, "reward_std": 0.48496684432029724, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7983073592185974, "rewards/length2tails_reward/std": 0.2785865366458893, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.65625, "completions/mean_terminated_length": 272.65625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08749987743794918, "epoch": 2.958, "frac_reward_zero_std": 0.0, "grad_norm": 0.1306840181350708, "learning_rate": 7.426262656346978e-07, "loss": 0.0027, "num_tokens": 12901685.0, "reward": 13.835283279418945, "reward_std": 0.4283444285392761, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7810969352722168, "rewards/length2tails_reward/std": 0.24721378087997437, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.65625, "completions/mean_terminated_length": 272.65625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08540861960500479, "epoch": 2.96, "frac_reward_zero_std": 0.0, "grad_norm": 0.14198768138885498, "learning_rate": 7.413873924717956e-07, "loss": 0.001, "num_tokens": 12910442.0, "reward": 12.985288619995117, "reward_std": 2.4996986389160156, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.452592670917511, "rewards/kidney_reward/mean": 2.395813465118408, "rewards/kidney_reward/std": 0.8154330849647522, "rewards/length2tails_reward/mean": 0.7963902950286865, "rewards/length2tails_reward/std": 0.2482798546552658, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.163670063018799, "rewards/thermo_reward/std": 1.6197998523712158, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.03125, "completions/mean_terminated_length": 273.03125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09230809472501278, "epoch": 2.9619999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.18161533772945404, "learning_rate": 7.401489445323472e-07, "loss": -0.0017, "num_tokens": 12919211.0, "reward": 13.677942276000977, "reward_std": 0.5635209083557129, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8031283617019653, "rewards/length2tails_reward/std": 0.25591933727264404, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.50996470451355, "rewards/thermo_reward/std": 0.5615194439888, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08371663466095924, "epoch": 2.964, "frac_reward_zero_std": 0.0, "grad_norm": 0.09580846130847931, "learning_rate": 7.389109238526685e-07, "loss": -0.0065, "num_tokens": 12927948.0, "reward": 13.760322570800781, "reward_std": 1.1355624198913574, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7880313396453857, "rewards/length2tails_reward/std": 0.24053345620632172, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.593855619430542, "rewards/thermo_reward/std": 1.1204947233200073, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09112018626183271, "epoch": 2.966, "frac_reward_zero_std": 0.0, "grad_norm": 0.11008063703775406, "learning_rate": 7.376733324683739e-07, "loss": 0.0004, "num_tokens": 12936698.0, "reward": 13.719146728515625, "reward_std": 0.5334470868110657, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.816321611404419, "rewards/length2tails_reward/std": 0.219462051987648, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498504638671875, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09244338981807232, "epoch": 2.968, "frac_reward_zero_std": 0.0, "grad_norm": 0.14250656962394714, "learning_rate": 7.364361724143713e-07, "loss": 0.0016, "num_tokens": 12945441.0, "reward": 13.672819137573242, "reward_std": 0.9924048781394958, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7941070795059204, "rewards/length2tails_reward/std": 0.2598108947277069, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.533102512359619, "rewards/thermo_reward/std": 0.8569762706756592, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.4375, "completions/mean_terminated_length": 270.4375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "entropy": 0.08355463948100805, "epoch": 2.9699999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.37764373421669006, "learning_rate": 7.351994457248595e-07, "loss": -0.0126, "num_tokens": 12954127.0, "reward": 13.909736633300781, "reward_std": 0.31327277421951294, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7279137969017029, "rewards/length2tails_reward/std": 0.26513198018074036, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.4375, "completions/mean_terminated_length": 271.4375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09196582902222872, "epoch": 2.972, "frac_reward_zero_std": 0.0, "grad_norm": 0.11685548722743988, "learning_rate": 7.33963154433325e-07, "loss": -0.0045, "num_tokens": 12962845.0, "reward": 13.807863235473633, "reward_std": 1.0549274682998657, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7532917857170105, "rewards/length2tails_reward/std": 0.27080824971199036, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6722283363342285, "rewards/thermo_reward/std": 0.8871302604675293, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0840097526088357, "epoch": 2.974, "frac_reward_zero_std": 0.0, "grad_norm": 0.09556731581687927, "learning_rate": 7.327273005725378e-07, "loss": 0.0022, "num_tokens": 12971617.0, "reward": 13.643620491027832, "reward_std": 1.7801275253295898, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.527078866958618, "rewards/kidney_reward/std": 0.5622953772544861, "rewards/length2tails_reward/mean": 0.8075554966926575, "rewards/length2tails_reward/std": 0.2638454735279083, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.574601173400879, "rewards/thermo_reward/std": 1.2273809909820557, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.53125, "completions/mean_terminated_length": 273.53125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08960465341806412, "epoch": 2.976, "frac_reward_zero_std": 0.0, "grad_norm": 0.16316860914230347, "learning_rate": 7.314918861745491e-07, "loss": 0.0006, "num_tokens": 12980402.0, "reward": 13.456883430480957, "reward_std": 1.9875643253326416, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.508101224899292, "rewards/kidney_reward/std": 0.6696491241455078, "rewards/length2tails_reward/mean": 0.8753653764724731, "rewards/length2tails_reward/std": 0.1597120761871338, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4575695991516113, "rewards/thermo_reward/std": 1.0479161739349365, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.0878768339753151, "epoch": 2.9779999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.10435619950294495, "learning_rate": 7.302569132706881e-07, "loss": -0.0044, "num_tokens": 12989106.0, "reward": 13.693092346191406, "reward_std": 1.0937471389770508, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7829598188400269, "rewards/length2tails_reward/std": 0.27791085839271545, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.71875, "completions/mean_terminated_length": 270.71875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09225155226886272, "epoch": 2.98, "frac_reward_zero_std": 0.0, "grad_norm": 0.08390070497989655, "learning_rate": 7.290223838915568e-07, "loss": 0.0025, "num_tokens": 12997801.0, "reward": 13.758573532104492, "reward_std": 0.5511883497238159, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.6864538192749023, "rewards/length2tails_reward/std": 0.25705966353416443, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.28125, "completions/mean_terminated_length": 271.28125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.08846852649003267, "epoch": 2.982, "frac_reward_zero_std": 0.0, "grad_norm": 0.10731520503759384, "learning_rate": 7.277883000670288e-07, "loss": -0.0101, "num_tokens": 13006514.0, "reward": 13.743173599243164, "reward_std": 0.8424339294433594, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8157355189323425, "rewards/length2tails_reward/std": 0.1975812315940857, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5739362239837646, "rewards/thermo_reward/std": 0.8346105217933655, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09590337425470352, "epoch": 2.984, "frac_reward_zero_std": 0.0, "grad_norm": 0.11507551372051239, "learning_rate": 7.26554663826245e-07, "loss": 0.0046, "num_tokens": 13015257.0, "reward": 13.836891174316406, "reward_std": 0.4224011301994324, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7971822023391724, "rewards/length2tails_reward/std": 0.2093581259250641, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08532546181231737, "epoch": 2.9859999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.72013258934021, "learning_rate": 7.2532147719761e-07, "loss": 0.0053, "num_tokens": 13024007.0, "reward": 12.834997177124023, "reward_std": 4.0108962059021, "rewards/fitness_reward/mean": 6.9856367111206055, "rewards/fitness_reward/std": 1.8179237842559814, "rewards/kidney_reward/mean": 2.371399402618408, "rewards/kidney_reward/std": 1.0544934272766113, "rewards/length2tails_reward/mean": 0.7837235927581787, "rewards/length2tails_reward/std": 0.26284104585647583, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2995893955230713, "rewards/thermo_reward/std": 1.2621272802352905, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.03125, "completions/mean_terminated_length": 270.03125, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.0831696605309844, "epoch": 2.988, "frac_reward_zero_std": 0.0, "grad_norm": 0.190970778465271, "learning_rate": 7.240887422087891e-07, "loss": -0.0258, "num_tokens": 13032680.0, "reward": 13.309157371520996, "reward_std": 2.8543763160705566, "rewards/fitness_reward/mean": 6.997875690460205, "rewards/fitness_reward/std": 2.0551881790161133, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8210108876228333, "rewards/length2tails_reward/std": 0.20082461833953857, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.530060291290283, "rewards/thermo_reward/std": 0.8719301223754883, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.34375, "completions/mean_terminated_length": 273.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08507030922919512, "epoch": 2.99, "frac_reward_zero_std": 0.0, "grad_norm": 0.08497648686170578, "learning_rate": 7.228564608867061e-07, "loss": 0.0068, "num_tokens": 13041459.0, "reward": 13.772529602050781, "reward_std": 0.5473827719688416, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8260170221328735, "rewards/length2tails_reward/std": 0.22926293313503265, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6296226978302, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08261603862047195, "epoch": 2.992, "frac_reward_zero_std": 0.0, "grad_norm": 0.10478515177965164, "learning_rate": 7.216246352575369e-07, "loss": -0.0042, "num_tokens": 13050219.0, "reward": 13.836133003234863, "reward_std": 0.43924397230148315, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.789600133895874, "rewards/length2tails_reward/std": 0.2798590064048767, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.09019718458876014, "epoch": 2.9939999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.09406401962041855, "learning_rate": 7.2039326734671e-07, "loss": -0.0024, "num_tokens": 13058954.0, "reward": 13.52038288116455, "reward_std": 2.2548234462738037, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.512369394302368, "rewards/kidney_reward/std": 0.6455049514770508, "rewards/length2tails_reward/mean": 0.7954090237617493, "rewards/length2tails_reward/std": 0.24171698093414307, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.524796962738037, "rewards/thermo_reward/std": 1.2938830852508545, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.25, "completions/mean_terminated_length": 273.25, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07924879249185324, "epoch": 2.996, "frac_reward_zero_std": 0.0, "grad_norm": 0.06274835765361786, "learning_rate": 7.191623591789005e-07, "loss": -0.0041, "num_tokens": 13067730.0, "reward": 13.637685775756836, "reward_std": 1.2025954723358154, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8336925506591797, "rewards/length2tails_reward/std": 0.2251635044813156, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4940104484558105, "rewards/thermo_reward/std": 1.054375171661377, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 273.75, "completions/mean_terminated_length": 273.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08210170036181808, "epoch": 2.998, "frac_reward_zero_std": 0.0, "grad_norm": 0.18217602372169495, "learning_rate": 7.179319127780274e-07, "loss": 0.002, "num_tokens": 13076522.0, "reward": 13.59107494354248, "reward_std": 1.8719508647918701, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5338897705078125, "rewards/kidney_reward/std": 0.5237670540809631, "rewards/length2tails_reward/mean": 0.8339365720748901, "rewards/length2tails_reward/std": 0.22332796454429626, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.570115804672241, "rewards/thermo_reward/std": 1.0470080375671387, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.8125, "completions/mean_terminated_length": 273.8125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08131207153201103, "epoch": 3.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.11286388337612152, "learning_rate": 7.167019301672508e-07, "loss": 0.0017, "num_tokens": 13085316.0, "reward": 13.719484329223633, "reward_std": 0.9667587876319885, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8409682512283325, "rewards/length2tails_reward/std": 0.2269384115934372, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5750818252563477, "rewards/thermo_reward/std": 0.8288360238075256, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08663884736597538, "epoch": 3.002, "frac_reward_zero_std": 0.0, "grad_norm": 0.07008332759141922, "learning_rate": 7.154724133689676e-07, "loss": -0.0032, "num_tokens": 13094047.0, "reward": 13.87397575378418, "reward_std": 0.384539395570755, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7691654562950134, "rewards/length2tails_reward/std": 0.2182990461587906, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.08829734660685062, "epoch": 3.004, "frac_reward_zero_std": 0.0, "grad_norm": 0.10279041528701782, "learning_rate": 7.142433644048098e-07, "loss": 0.0024, "num_tokens": 13102797.0, "reward": 13.54155158996582, "reward_std": 1.5787534713745117, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4996724128723145, "rewards/kidney_reward/std": 0.5786310434341431, "rewards/length2tails_reward/mean": 0.8198412656784058, "rewards/length2tails_reward/std": 0.2560720145702362, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4987106323242188, "rewards/thermo_reward/std": 1.0301309823989868, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 271.65625, "completions/mean_terminated_length": 271.65625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.10298821609467268, "epoch": 3.006, "frac_reward_zero_std": 0.0, "grad_norm": 0.1507360190153122, "learning_rate": 7.130147852956394e-07, "loss": 0.0024, "num_tokens": 13111522.0, "reward": 13.954971313476562, "reward_std": 0.22310635447502136, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7814017534255981, "rewards/length2tails_reward/std": 0.24012959003448486, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08291113376617432, "epoch": 3.008, "frac_reward_zero_std": 0.0, "grad_norm": 0.09166242182254791, "learning_rate": 7.11786678061546e-07, "loss": 0.0017, "num_tokens": 13120276.0, "reward": 13.839548110961914, "reward_std": 0.4274718165397644, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8237426280975342, "rewards/length2tails_reward/std": 0.223821222782135, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 276.78125, "completions/mean_terminated_length": 276.78125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09564272407442331, "epoch": 3.01, "frac_reward_zero_std": 0.0, "grad_norm": 0.13374081254005432, "learning_rate": 7.105590447218437e-07, "loss": -0.0031, "num_tokens": 13129165.0, "reward": 13.877220153808594, "reward_std": 0.3734629452228546, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.801613986492157, "rewards/length2tails_reward/std": 0.25205737352371216, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.09375, "completions/mean_terminated_length": 272.09375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08292873064056039, "epoch": 3.012, "frac_reward_zero_std": 0.0, "grad_norm": 0.2027139514684677, "learning_rate": 7.093318872950664e-07, "loss": -0.0017, "num_tokens": 13137904.0, "reward": 13.6582670211792, "reward_std": 0.6689882874488831, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7547006607055664, "rewards/length2tails_reward/std": 0.28135377168655396, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498504638671875, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.15625, "completions/mean_terminated_length": 273.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08827351219952106, "epoch": 3.014, "frac_reward_zero_std": 0.0, "grad_norm": 0.10204388946294785, "learning_rate": 7.081052077989667e-07, "loss": -0.0048, "num_tokens": 13146677.0, "reward": 13.630050659179688, "reward_std": 1.1362738609313965, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8250014781951904, "rewards/length2tails_reward/std": 0.24018093943595886, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0833295239135623, "epoch": 3.016, "frac_reward_zero_std": 0.0, "grad_norm": 0.1260230839252472, "learning_rate": 7.068790082505107e-07, "loss": -0.0019, "num_tokens": 13155413.0, "reward": 13.230853080749512, "reward_std": 1.622039794921875, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7348164916038513, "rewards/length2tails_reward/std": 0.3072350323200226, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.124424934387207, "rewards/thermo_reward/std": 1.4224731922149658, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.28125, "completions/mean_terminated_length": 273.28125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08457625657320023, "epoch": 3.018, "frac_reward_zero_std": 0.0, "grad_norm": 0.12385375797748566, "learning_rate": 7.056532906658752e-07, "loss": 0.0009, "num_tokens": 13164190.0, "reward": 13.543134689331055, "reward_std": 1.7194054126739502, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8602433800697327, "rewards/length2tails_reward/std": 0.20957812666893005, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4543137550354004, "rewards/thermo_reward/std": 1.2629039287567139, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.1875, "completions/mean_terminated_length": 272.1875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.08766997698694468, "epoch": 3.02, "frac_reward_zero_std": 0.0, "grad_norm": 0.1102355569601059, "learning_rate": 7.04428057060445e-07, "loss": -0.0051, "num_tokens": 13172932.0, "reward": 13.820863723754883, "reward_std": 0.574556827545166, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7852233648300171, "rewards/length2tails_reward/std": 0.20834431052207947, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09607646521180868, "epoch": 3.022, "frac_reward_zero_std": 0.0, "grad_norm": 0.08452524989843369, "learning_rate": 7.032033094488093e-07, "loss": -0.0032, "num_tokens": 13181676.0, "reward": 13.797369956970215, "reward_std": 0.47734662890434265, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8008310794830322, "rewards/length2tails_reward/std": 0.2595970034599304, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09418830275535583, "epoch": 3.024, "frac_reward_zero_std": 0.0, "grad_norm": 0.1552160084247589, "learning_rate": 7.019790498447571e-07, "loss": 0.0019, "num_tokens": 13190441.0, "reward": 13.758134841918945, "reward_std": 0.49969205260276794, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8073362112045288, "rewards/length2tails_reward/std": 0.2395055890083313, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.9375, "completions/mean_terminated_length": 270.9375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0807279646396637, "epoch": 3.026, "frac_reward_zero_std": 0.0, "grad_norm": 0.10100645571947098, "learning_rate": 7.007552802612764e-07, "loss": 0.0006, "num_tokens": 13199143.0, "reward": 13.867382049560547, "reward_std": 0.3748721182346344, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.70323646068573, "rewards/length2tails_reward/std": 0.28744640946388245, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 274.6875, "completions/mean_terminated_length": 274.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08788215974345803, "epoch": 3.028, "frac_reward_zero_std": 0.0, "grad_norm": 0.13024696707725525, "learning_rate": 6.995320027105481e-07, "loss": -0.0022, "num_tokens": 13207965.0, "reward": 13.626249313354492, "reward_std": 1.1537022590637207, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8483192920684814, "rewards/length2tails_reward/std": 0.22150689363479614, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5084705352783203, "rewards/thermo_reward/std": 0.9801769852638245, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09518216829746962, "epoch": 3.03, "frac_reward_zero_std": 0.0, "grad_norm": 0.09895037114620209, "learning_rate": 6.983092192039455e-07, "loss": 0.0021, "num_tokens": 13216685.0, "reward": 13.582860946655273, "reward_std": 1.8713279962539673, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.538771867752075, "rewards/kidney_reward/std": 0.49614986777305603, "rewards/length2tails_reward/mean": 0.7530010342597961, "rewards/length2tails_reward/std": 0.24346208572387695, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5076041221618652, "rewards/thermo_reward/std": 1.3884414434432983, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.5, "completions/mean_terminated_length": 273.5, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08413621364161372, "epoch": 3.032, "frac_reward_zero_std": 0.0, "grad_norm": 0.10287663340568542, "learning_rate": 6.970869317520279e-07, "loss": 0.0008, "num_tokens": 13225469.0, "reward": 13.882369995117188, "reward_std": 0.38048040866851807, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8531134128570557, "rewards/length2tails_reward/std": 0.20098592340946198, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.3125, "completions/mean_terminated_length": 272.3125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08602802455425262, "epoch": 3.034, "frac_reward_zero_std": 0.0, "grad_norm": 0.12859375774860382, "learning_rate": 6.958651423645407e-07, "loss": -0.0047, "num_tokens": 13234215.0, "reward": 13.630465507507324, "reward_std": 0.917415976524353, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7645262479782104, "rewards/length2tails_reward/std": 0.28300315141677856, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4937076568603516, "rewards/thermo_reward/std": 0.8699823021888733, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 754.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 287.40625, "completions/mean_terminated_length": 272.3548278808594, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10801392560824752, "epoch": 3.036, "frac_reward_zero_std": 0.0, "grad_norm": 1.9083175659179688, "learning_rate": 6.946438530504093e-07, "loss": -0.0247, "num_tokens": 13243444.0, "reward": 13.646073341369629, "reward_std": 0.9348779320716858, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7570778727531433, "rewards/length2tails_reward/std": 0.2936254143714905, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.482700824737549, "rewards/thermo_reward/std": 0.9230156540870667, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 270.5625, "completions/mean_terminated_length": 270.5625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.08214408811181784, "epoch": 3.038, "frac_reward_zero_std": 0.0, "grad_norm": 0.1576862931251526, "learning_rate": 6.934230658177372e-07, "loss": 0.0011, "num_tokens": 13252134.0, "reward": 13.114895820617676, "reward_std": 4.055248260498047, "rewards/fitness_reward/mean": 7.030417442321777, "rewards/fitness_reward/std": 1.8711055517196655, "rewards/kidney_reward/mean": 2.4575986862182617, "rewards/kidney_reward/std": 0.9553350806236267, "rewards/length2tails_reward/mean": 0.7602359652519226, "rewards/length2tails_reward/std": 0.27951622009277344, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4508557319641113, "rewards/thermo_reward/std": 1.2813252210617065, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.08832096960395575, "epoch": 3.04, "frac_reward_zero_std": 0.0, "grad_norm": 0.07851015776395798, "learning_rate": 6.922027826738017e-07, "loss": -0.0059, "num_tokens": 13260855.0, "reward": 13.069082260131836, "reward_std": 2.6473140716552734, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.45493483543396, "rewards/kidney_reward/std": 0.6855012774467468, "rewards/length2tails_reward/mean": 0.7882065773010254, "rewards/length2tails_reward/std": 0.26846760511398315, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.131650447845459, "rewards/thermo_reward/std": 1.86585533618927, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.65625, "completions/mean_terminated_length": 271.65625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08753087185323238, "epoch": 3.042, "frac_reward_zero_std": 0.0, "grad_norm": 0.09079719334840775, "learning_rate": 6.909830056250526e-07, "loss": 0.0031, "num_tokens": 13269580.0, "reward": 13.12080192565918, "reward_std": 2.2184596061706543, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.354142189025879, "rewards/kidney_reward/std": 0.8856872916221619, "rewards/length2tails_reward/mean": 0.706944465637207, "rewards/length2tails_reward/std": 0.328614205121994, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2922892570495605, "rewards/thermo_reward/std": 1.2992886304855347, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09137775283306837, "epoch": 3.044, "frac_reward_zero_std": 0.0, "grad_norm": 0.13857701420783997, "learning_rate": 6.897637366771066e-07, "loss": 0.0034, "num_tokens": 13278332.0, "reward": 13.755581855773926, "reward_std": 0.5007432103157043, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.781810998916626, "rewards/length2tails_reward/std": 0.25280410051345825, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.0889419955201447, "epoch": 3.046, "frac_reward_zero_std": 0.0, "grad_norm": 0.06803309172391891, "learning_rate": 6.885449778347448e-07, "loss": -0.0041, "num_tokens": 13287063.0, "reward": 13.599407196044922, "reward_std": 1.1936312913894653, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7993759512901306, "rewards/length2tails_reward/std": 0.2781977951526642, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4591641426086426, "rewards/thermo_reward/std": 1.0398609638214111, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.65625, "completions/mean_terminated_length": 271.65625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.08468360500410199, "epoch": 3.048, "frac_reward_zero_std": 0.0, "grad_norm": 0.10303880274295807, "learning_rate": 6.8732673110191e-07, "loss": 0.0065, "num_tokens": 13295788.0, "reward": 13.873723983764648, "reward_std": 0.3729260563850403, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7666493654251099, "rewards/length2tails_reward/std": 0.28587543964385986, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.8125, "completions/mean_terminated_length": 271.8125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07929568085819483, "epoch": 3.05, "frac_reward_zero_std": 0.0, "grad_norm": 0.08844693750143051, "learning_rate": 6.861089984817032e-07, "loss": -0.0001, "num_tokens": 13304518.0, "reward": 13.651402473449707, "reward_std": 0.730636477470398, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5263874530792236, "rewards/kidney_reward/std": 0.5662070512771606, "rewards/length2tails_reward/mean": 0.7409303784370422, "rewards/length2tails_reward/std": 0.30849483609199524, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09142199764028192, "epoch": 3.052, "frac_reward_zero_std": 0.0, "grad_norm": 0.08430390805006027, "learning_rate": 6.848917819763793e-07, "loss": 0.002, "num_tokens": 13313263.0, "reward": 13.95113754272461, "reward_std": 0.222835972905159, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7430736422538757, "rewards/length2tails_reward/std": 0.27778905630111694, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08689901698380709, "epoch": 3.054, "frac_reward_zero_std": 0.0, "grad_norm": 0.2107875794172287, "learning_rate": 6.836750835873453e-07, "loss": -0.0049, "num_tokens": 13322011.0, "reward": 13.468440055847168, "reward_std": 3.0067617893218994, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.5390896797180176, "rewards/kidney_reward/std": 0.49435171484947205, "rewards/length2tails_reward/mean": 0.806360125541687, "rewards/length2tails_reward/std": 0.24003668129444122, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.69566011428833, "rewards/thermo_reward/std": 0.7545809149742126, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.09294356871396303, "epoch": 3.056, "frac_reward_zero_std": 0.0, "grad_norm": 0.1360463947057724, "learning_rate": 6.824589053151557e-07, "loss": -0.0019, "num_tokens": 13330687.0, "reward": 13.681583404541016, "reward_std": 0.9170136451721191, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7322138547897339, "rewards/length2tails_reward/std": 0.2564483880996704, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.520697593688965, "rewards/thermo_reward/std": 0.9184462428092957, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 271.1875, "completions/mean_terminated_length": 271.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.07564508123323321, "epoch": 3.058, "frac_reward_zero_std": 0.0, "grad_norm": 0.09548187255859375, "learning_rate": 6.812432491595102e-07, "loss": -0.0056, "num_tokens": 13339397.0, "reward": 13.341715812683105, "reward_std": 2.5133228302001953, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.447655439376831, "rewards/kidney_reward/std": 1.011582612991333, "rewards/length2tails_reward/mean": 0.7018471360206604, "rewards/length2tails_reward/std": 0.3438743054866791, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3626902103424072, "rewards/thermo_reward/std": 1.6830047369003296, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08310075104236603, "epoch": 3.06, "frac_reward_zero_std": 0.0, "grad_norm": 0.10243958979845047, "learning_rate": 6.800281171192501e-07, "loss": 0.0017, "num_tokens": 13348161.0, "reward": 13.800646781921387, "reward_std": 0.4690954089164734, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8335978984832764, "rewards/length2tails_reward/std": 0.16988824307918549, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09586561750620604, "epoch": 3.062, "frac_reward_zero_std": 0.0, "grad_norm": 0.11107802391052246, "learning_rate": 6.788135111923545e-07, "loss": -0.0035, "num_tokens": 13356916.0, "reward": 13.770654678344727, "reward_std": 0.5631916522979736, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8072643876075745, "rewards/length2tails_reward/std": 0.28409481048583984, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.15625, "completions/mean_terminated_length": 273.15625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08622128795832396, "epoch": 3.064, "frac_reward_zero_std": 0.0, "grad_norm": 0.08057373017072678, "learning_rate": 6.775994333759378e-07, "loss": -0.0001, "num_tokens": 13365689.0, "reward": 13.850677490234375, "reward_std": 0.4851415753364563, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.809777021408081, "rewards/length2tails_reward/std": 0.24094976484775543, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07591812824830413, "epoch": 3.066, "frac_reward_zero_std": 0.0, "grad_norm": 0.10544803738594055, "learning_rate": 6.763858856662457e-07, "loss": -0.0028, "num_tokens": 13374447.0, "reward": 13.523006439208984, "reward_std": 1.351911187171936, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5063438415527344, "rewards/kidney_reward/std": 0.5423585772514343, "rewards/length2tails_reward/mean": 0.7550817728042603, "rewards/length2tails_reward/std": 0.2964111268520355, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4799704551696777, "rewards/thermo_reward/std": 1.1274259090423584, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0872881724499166, "epoch": 3.068, "frac_reward_zero_std": 0.0, "grad_norm": 0.10545146465301514, "learning_rate": 6.751728700586525e-07, "loss": -0.0006, "num_tokens": 13383202.0, "reward": 13.916288375854492, "reward_std": 0.31205329298973083, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7934369444847107, "rewards/length2tails_reward/std": 0.24310320615768433, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0812803409062326, "epoch": 3.07, "frac_reward_zero_std": 0.0, "grad_norm": 0.08104109019041061, "learning_rate": 6.739603885476582e-07, "loss": -0.0019, "num_tokens": 13391938.0, "reward": 13.794636726379395, "reward_std": 0.4737315773963928, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7734951376914978, "rewards/length2tails_reward/std": 0.24882858991622925, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.09241558611392975, "epoch": 3.072, "frac_reward_zero_std": 0.0, "grad_norm": 0.17455969750881195, "learning_rate": 6.727484431268831e-07, "loss": 0.0065, "num_tokens": 13400682.0, "reward": 13.918707847595215, "reward_std": 0.3102504312992096, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.817625105381012, "rewards/length2tails_reward/std": 0.24556082487106323, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.71875, "completions/mean_terminated_length": 272.71875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.08524061739444733, "epoch": 3.074, "frac_reward_zero_std": 0.0, "grad_norm": 0.07834436744451523, "learning_rate": 6.715370357890678e-07, "loss": -0.0058, "num_tokens": 13409441.0, "reward": 13.360005378723145, "reward_std": 2.4934613704681396, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.484485626220703, "rewards/kidney_reward/std": 0.8032392859458923, "rewards/length2tails_reward/mean": 0.7930335998535156, "rewards/length2tails_reward/std": 0.2633064091205597, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.450049877166748, "rewards/thermo_reward/std": 1.0861150026321411, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 273.6875, "completions/mean_terminated_length": 273.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10845900978893042, "epoch": 3.076, "frac_reward_zero_std": 0.0, "grad_norm": 0.1100066602230072, "learning_rate": 6.703261685260663e-07, "loss": -0.001, "num_tokens": 13418231.0, "reward": 13.392513275146484, "reward_std": 1.8563884496688843, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.473618507385254, "rewards/kidney_reward/std": 0.5860860347747803, "rewards/length2tails_reward/mean": 0.7303594350814819, "rewards/length2tails_reward/std": 0.29636603593826294, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.384673833847046, "rewards/thermo_reward/std": 1.30971097946167, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08378116134554148, "epoch": 3.078, "frac_reward_zero_std": 0.0, "grad_norm": 0.13051924109458923, "learning_rate": 6.691158433288464e-07, "loss": 0.0019, "num_tokens": 13426963.0, "reward": 13.707208633422852, "reward_std": 1.1767970323562622, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7400823831558228, "rewards/length2tails_reward/std": 0.2969076633453369, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5728940963745117, "rewards/thermo_reward/std": 1.0320227146148682, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 270.84375, "completions/mean_terminated_length": 270.84375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.09347997140139341, "epoch": 3.08, "frac_reward_zero_std": 0.0, "grad_norm": 0.14912846684455872, "learning_rate": 6.679060621874833e-07, "loss": 0.0012, "num_tokens": 13435662.0, "reward": 13.875182151794434, "reward_std": 0.38105764985084534, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7812291979789734, "rewards/length2tails_reward/std": 0.24451898038387299, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 274.90625, "completions/mean_terminated_length": 274.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08737827092409134, "epoch": 3.082, "frac_reward_zero_std": 0.0, "grad_norm": 0.12639953196048737, "learning_rate": 6.666968270911584e-07, "loss": -0.0018, "num_tokens": 13444491.0, "reward": 13.922403335571289, "reward_std": 0.3234083950519562, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.854577898979187, "rewards/length2tails_reward/std": 0.2517944872379303, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08177088387310505, "epoch": 3.084, "frac_reward_zero_std": 0.0, "grad_norm": 0.11636722087860107, "learning_rate": 6.654881400281547e-07, "loss": -0.0042, "num_tokens": 13453238.0, "reward": 13.915096282958984, "reward_std": 0.32327255606651306, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7815147638320923, "rewards/length2tails_reward/std": 0.2819795310497284, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.84375, "completions/mean_terminated_length": 270.84375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.09405922424048185, "epoch": 3.086, "frac_reward_zero_std": 0.0, "grad_norm": 0.1301419883966446, "learning_rate": 6.642800029858546e-07, "loss": -0.0003, "num_tokens": 13461937.0, "reward": 13.520773887634277, "reward_std": 1.081948161125183, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4921867847442627, "rewards/kidney_reward/std": 0.6195381283760071, "rewards/length2tails_reward/mean": 0.77314692735672, "rewards/length2tails_reward/std": 0.26673877239227295, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4900870323181152, "rewards/thermo_reward/std": 0.8872950673103333, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.8125, "completions/mean_terminated_length": 272.8125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08489370346069336, "epoch": 3.088, "frac_reward_zero_std": 0.0, "grad_norm": 0.11313661932945251, "learning_rate": 6.630724179507361e-07, "loss": 0.0013, "num_tokens": 13470699.0, "reward": 13.793017387390137, "reward_std": 0.4702511131763458, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7573058009147644, "rewards/length2tails_reward/std": 0.3235591948032379, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.9375, "completions/mean_terminated_length": 271.9375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08466823026537895, "epoch": 3.09, "frac_reward_zero_std": 0.0, "grad_norm": 0.11544471979141235, "learning_rate": 6.618653869083688e-07, "loss": 0.0016, "num_tokens": 13479433.0, "reward": 12.832748413085938, "reward_std": 4.525689125061035, "rewards/fitness_reward/mean": 6.952455520629883, "rewards/fitness_reward/std": 2.0028762817382812, "rewards/kidney_reward/mean": 2.3317272663116455, "rewards/kidney_reward/std": 1.091876745223999, "rewards/length2tails_reward/mean": 0.7750874757766724, "rewards/length2tails_reward/std": 0.2632858455181122, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.371056318283081, "rewards/thermo_reward/std": 1.5435597896575928, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0935161211527884, "epoch": 3.092, "frac_reward_zero_std": 0.0, "grad_norm": 0.23990941047668457, "learning_rate": 6.606589118434125e-07, "loss": 0.0025, "num_tokens": 13488171.0, "reward": 13.167928695678711, "reward_std": 2.527303695678711, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3861141204833984, "rewards/kidney_reward/std": 0.8551732897758484, "rewards/length2tails_reward/mean": 0.7293937802314758, "rewards/length2tails_reward/std": 0.3044939637184143, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.30519962310791, "rewards/thermo_reward/std": 1.4556901454925537, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.4375, "completions/mean_terminated_length": 273.4375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0900156507268548, "epoch": 3.094, "frac_reward_zero_std": 0.0, "grad_norm": 0.12602326273918152, "learning_rate": 6.59452994739612e-07, "loss": 0.0002, "num_tokens": 13496953.0, "reward": 13.653825759887695, "reward_std": 0.6284289956092834, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8355517387390137, "rewards/length2tails_reward/std": 0.2260853499174118, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.50996470451355, "rewards/thermo_reward/std": 0.5615194439888, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.71875, "completions/mean_terminated_length": 271.71875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08785140514373779, "epoch": 3.096, "frac_reward_zero_std": 0.0, "grad_norm": 0.09885193407535553, "learning_rate": 6.582476375797948e-07, "loss": -0.0035, "num_tokens": 13505680.0, "reward": 13.804637908935547, "reward_std": 0.5242769122123718, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7482430934906006, "rewards/length2tails_reward/std": 0.25076186656951904, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 274.6875, "completions/mean_terminated_length": 274.6875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.0945708341896534, "epoch": 3.098, "frac_reward_zero_std": 0.0, "grad_norm": 0.5140098333358765, "learning_rate": 6.570428423458686e-07, "loss": -0.0029, "num_tokens": 13514502.0, "reward": 12.935726165771484, "reward_std": 5.054202556610107, "rewards/fitness_reward/mean": 6.980704307556152, "rewards/fitness_reward/std": 2.1523258686065674, "rewards/kidney_reward/mean": 2.3935017585754395, "rewards/kidney_reward/std": 1.3179216384887695, "rewards/length2tails_reward/mean": 0.7184333205223083, "rewards/length2tails_reward/std": 0.29144835472106934, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.389676094055176, "rewards/thermo_reward/std": 1.6116918325424194, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09089232515543699, "epoch": 3.1, "frac_reward_zero_std": 0.0, "grad_norm": 0.18398801982402802, "learning_rate": 6.558386110188157e-07, "loss": -0.0039, "num_tokens": 13523245.0, "reward": 13.343172073364258, "reward_std": 2.6493773460388184, "rewards/fitness_reward/mean": 7.026922702789307, "rewards/fitness_reward/std": 1.8908731937408447, "rewards/kidney_reward/mean": 2.510338068008423, "rewards/kidney_reward/std": 0.5207419395446777, "rewards/length2tails_reward/mean": 0.7628942728042603, "rewards/length2tails_reward/std": 0.2707509398460388, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.15625, "completions/mean_terminated_length": 273.15625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08175284508615732, "epoch": 3.102, "frac_reward_zero_std": 0.0, "grad_norm": 0.0869038999080658, "learning_rate": 6.546349455786925e-07, "loss": -0.005, "num_tokens": 13532018.0, "reward": 13.078606605529785, "reward_std": 2.4494495391845703, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.470146656036377, "rewards/kidney_reward/std": 0.6043990254402161, "rewards/length2tails_reward/mean": 0.8006786108016968, "rewards/length2tails_reward/std": 0.2602311670780182, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1247167587280273, "rewards/thermo_reward/std": 1.6955657005310059, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08147422038018703, "epoch": 3.104, "frac_reward_zero_std": 0.0, "grad_norm": 0.08447594940662384, "learning_rate": 6.534318480046239e-07, "loss": -0.0035, "num_tokens": 13540734.0, "reward": 13.59555435180664, "reward_std": 1.7914360761642456, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.519392490386963, "rewards/kidney_reward/std": 0.6057767271995544, "rewards/length2tails_reward/mean": 0.7064756155014038, "rewards/length2tails_reward/std": 0.27535876631736755, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.601839065551758, "rewards/thermo_reward/std": 0.8774381279945374, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08858750760555267, "epoch": 3.106, "frac_reward_zero_std": 0.0, "grad_norm": 0.1847735047340393, "learning_rate": 6.522293202748017e-07, "loss": 0.0004, "num_tokens": 13549494.0, "reward": 13.918962478637695, "reward_std": 0.3105127811431885, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8201719522476196, "rewards/length2tails_reward/std": 0.1941031664609909, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0853114714846015, "epoch": 3.108, "frac_reward_zero_std": 0.0, "grad_norm": 0.17123565077781677, "learning_rate": 6.51027364366481e-07, "loss": 0.0022, "num_tokens": 13558218.0, "reward": 13.52690601348877, "reward_std": 1.9610837697982788, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5366315841674805, "rewards/kidney_reward/std": 0.5082566142082214, "rewards/length2tails_reward/mean": 0.7408733367919922, "rewards/length2tails_reward/std": 0.27830973267555237, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5125112533569336, "rewards/thermo_reward/std": 1.155271291732788, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 268.625, "completions/mean_terminated_length": 268.625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.0793598871678114, "epoch": 3.11, "frac_reward_zero_std": 0.0, "grad_norm": 0.09068351984024048, "learning_rate": 6.498259822559757e-07, "loss": 0.0019, "num_tokens": 13566846.0, "reward": 13.65500259399414, "reward_std": 0.8623346090316772, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7111403942108154, "rewards/length2tails_reward/std": 0.30966800451278687, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4962234497070312, "rewards/thermo_reward/std": 0.8580347895622253, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.96875, "completions/mean_terminated_length": 273.96875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.08767529763281345, "epoch": 3.112, "frac_reward_zero_std": 0.0, "grad_norm": 0.08908183872699738, "learning_rate": 6.486251759186572e-07, "loss": -0.0025, "num_tokens": 13575645.0, "reward": 13.632354736328125, "reward_std": 1.063725471496582, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8941877484321594, "rewards/length2tails_reward/std": 0.16848689317703247, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4826302528381348, "rewards/thermo_reward/std": 0.9233596324920654, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08610057085752487, "epoch": 3.114, "frac_reward_zero_std": 0.0, "grad_norm": 0.14670053124427795, "learning_rate": 6.474249473289497e-07, "loss": 0.0057, "num_tokens": 13584393.0, "reward": 13.596689224243164, "reward_std": 1.6090404987335205, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5318994522094727, "rewards/kidney_reward/std": 0.535025417804718, "rewards/length2tails_reward/mean": 0.807540774345398, "rewards/length2tails_reward/std": 0.24658679962158203, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5228500366210938, "rewards/thermo_reward/std": 1.1001607179641724, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08467079140245914, "epoch": 3.116, "frac_reward_zero_std": 0.0, "grad_norm": 0.15976735949516296, "learning_rate": 6.462252984603276e-07, "loss": 0.0062, "num_tokens": 13593147.0, "reward": 13.846799850463867, "reward_std": 0.475908488035202, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7709941267967224, "rewards/length2tails_reward/std": 0.25571998953819275, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07919182069599628, "epoch": 3.118, "frac_reward_zero_std": 0.0, "grad_norm": 0.11993545293807983, "learning_rate": 6.45026231285312e-07, "loss": 0.0006, "num_tokens": 13601911.0, "reward": 13.699793815612793, "reward_std": 1.1293576955795288, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7839176058769226, "rewards/length2tails_reward/std": 0.28801000118255615, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5884556770324707, "rewards/thermo_reward/std": 0.9485320448875427, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.3125, "completions/mean_terminated_length": 273.3125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08516907598823309, "epoch": 3.12, "frac_reward_zero_std": 0.0, "grad_norm": 0.10007896274328232, "learning_rate": 6.438277477754678e-07, "loss": -0.0011, "num_tokens": 13610689.0, "reward": 13.881974220275879, "reward_std": 0.37886977195739746, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8491480350494385, "rewards/length2tails_reward/std": 0.19460441172122955, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08439195808023214, "epoch": 3.122, "frac_reward_zero_std": 0.0, "grad_norm": 0.11381310224533081, "learning_rate": 6.426298499013993e-07, "loss": 0.0018, "num_tokens": 13619469.0, "reward": 13.816393852233887, "reward_std": 0.5212529897689819, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8657984137535095, "rewards/length2tails_reward/std": 0.16835640370845795, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07988157263025641, "epoch": 3.124, "frac_reward_zero_std": 0.0, "grad_norm": 0.1909734606742859, "learning_rate": 6.414325396327492e-07, "loss": 0.003, "num_tokens": 13628224.0, "reward": 12.954696655273438, "reward_std": 5.199808120727539, "rewards/fitness_reward/mean": 6.977269172668457, "rewards/fitness_reward/std": 2.171755075454712, "rewards/kidney_reward/mean": 2.3789546489715576, "rewards/kidney_reward/std": 1.4002131223678589, "rewards/length2tails_reward/mean": 0.7894188165664673, "rewards/length2tails_reward/std": 0.25925904512405396, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.419530153274536, "rewards/thermo_reward/std": 1.6613085269927979, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.28125, "completions/mean_terminated_length": 273.28125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08113757288083434, "epoch": 3.126, "frac_reward_zero_std": 0.0, "grad_norm": 0.08598976582288742, "learning_rate": 6.402358189381933e-07, "loss": -0.0033, "num_tokens": 13637001.0, "reward": 13.570723533630371, "reward_std": 1.384864091873169, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7958351373672485, "rewards/length2tails_reward/std": 0.32051900029182434, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.458193302154541, "rewards/thermo_reward/std": 1.1682724952697754, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.65625, "completions/mean_terminated_length": 270.65625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.08710565231740475, "epoch": 3.128, "frac_reward_zero_std": 0.0, "grad_norm": 0.13959741592407227, "learning_rate": 6.390396897854378e-07, "loss": 0.0063, "num_tokens": 13645694.0, "reward": 13.752948760986328, "reward_std": 0.5005505681037903, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.755474328994751, "rewards/length2tails_reward/std": 0.2612459063529968, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.46875, "completions/mean_terminated_length": 273.46875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08403254486620426, "epoch": 3.13, "frac_reward_zero_std": 0.0, "grad_norm": 0.09504576772451401, "learning_rate": 6.37844154141217e-07, "loss": 0.0023, "num_tokens": 13654477.0, "reward": 13.760905265808105, "reward_std": 0.5017654299736023, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.835037887096405, "rewards/length2tails_reward/std": 0.23825286328792572, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.78125, "completions/mean_terminated_length": 273.78125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08621383644640446, "epoch": 3.132, "frac_reward_zero_std": 0.0, "grad_norm": 0.11715393513441086, "learning_rate": 6.366492139712885e-07, "loss": -0.0018, "num_tokens": 13663270.0, "reward": 13.812804222106934, "reward_std": 0.5235148072242737, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8299036622047424, "rewards/length2tails_reward/std": 0.23763789236545563, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.0625, "completions/mean_terminated_length": 271.0625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.09282642137259245, "epoch": 3.134, "frac_reward_zero_std": 0.0, "grad_norm": 0.12017302215099335, "learning_rate": 6.354548712404313e-07, "loss": -0.0013, "num_tokens": 13671976.0, "reward": 13.878923416137695, "reward_std": 0.3813875913619995, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8186453580856323, "rewards/length2tails_reward/std": 0.22613418102264404, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.09375, "completions/mean_terminated_length": 272.09375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.0821957616135478, "epoch": 3.136, "frac_reward_zero_std": 0.0, "grad_norm": 0.07647687196731567, "learning_rate": 6.342611279124421e-07, "loss": -0.0019, "num_tokens": 13680715.0, "reward": 13.818721771240234, "reward_std": 0.8213350176811218, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8214808702468872, "rewards/length2tails_reward/std": 0.21197476983070374, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6489083766937256, "rewards/thermo_reward/std": 0.8178472518920898, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 271.40625, "completions/mean_terminated_length": 271.40625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08138279803097248, "epoch": 3.138, "frac_reward_zero_std": 0.0, "grad_norm": 0.08109301328659058, "learning_rate": 6.330679859501315e-07, "loss": -0.0057, "num_tokens": 13689432.0, "reward": 13.03253173828125, "reward_std": 2.44555926322937, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.437547206878662, "rewards/kidney_reward/std": 0.745245099067688, "rewards/length2tails_reward/mean": 0.7165517807006836, "rewards/length2tails_reward/std": 0.28471410274505615, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.0621438026428223, "rewards/thermo_reward/std": 1.7939794063568115, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.28125, "completions/mean_terminated_length": 273.28125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08364815032109618, "epoch": 3.14, "frac_reward_zero_std": 0.0, "grad_norm": 0.06438497453927994, "learning_rate": 6.31875447315322e-07, "loss": -0.0039, "num_tokens": 13698209.0, "reward": 13.600080490112305, "reward_std": 1.6844980716705322, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.500974655151367, "rewards/kidney_reward/std": 0.5715343952178955, "rewards/length2tails_reward/mean": 0.8087210059165955, "rewards/length2tails_reward/std": 0.2439965009689331, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.557048797607422, "rewards/thermo_reward/std": 1.1177526712417603, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.07725761877372861, "epoch": 3.142, "frac_reward_zero_std": 0.0, "grad_norm": 0.06575439870357513, "learning_rate": 6.306835139688438e-07, "loss": -0.0062, "num_tokens": 13706940.0, "reward": 13.157386779785156, "reward_std": 3.1938116550445557, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.4843716621398926, "rewards/kidney_reward/std": 0.5299732089042664, "rewards/length2tails_reward/mean": 0.8027685880661011, "rewards/length2tails_reward/std": 0.2714056968688965, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4396843910217285, "rewards/thermo_reward/std": 1.2497756481170654, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08324752282351255, "epoch": 3.144, "frac_reward_zero_std": 0.0, "grad_norm": 0.11135981976985931, "learning_rate": 6.294921878705312e-07, "loss": -0.0039, "num_tokens": 13715690.0, "reward": 13.836538314819336, "reward_std": 0.4356030225753784, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.793649435043335, "rewards/length2tails_reward/std": 0.24881352484226227, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08308309270069003, "epoch": 3.146, "frac_reward_zero_std": 0.0, "grad_norm": 0.09605570137500763, "learning_rate": 6.283014709792214e-07, "loss": 0.0024, "num_tokens": 13724454.0, "reward": 13.760598182678223, "reward_std": 0.5016447901725769, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8319677114486694, "rewards/length2tails_reward/std": 0.1931665539741516, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.84375, "completions/mean_terminated_length": 272.84375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0848759338259697, "epoch": 3.148, "frac_reward_zero_std": 0.0, "grad_norm": 0.09683488309383392, "learning_rate": 6.271113652527485e-07, "loss": -0.0002, "num_tokens": 13733217.0, "reward": 13.728259086608887, "reward_std": 0.5810441374778748, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.782167911529541, "rewards/length2tails_reward/std": 0.2472234070301056, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.40625, "completions/mean_terminated_length": 273.40625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08548160269856453, "epoch": 3.15, "frac_reward_zero_std": 0.0, "grad_norm": 0.12159522622823715, "learning_rate": 6.259218726479427e-07, "loss": -0.0008, "num_tokens": 13741998.0, "reward": 13.842123985290527, "reward_std": 0.43085977435112, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8495111465454102, "rewards/length2tails_reward/std": 0.19437187910079956, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08964389516040683, "epoch": 3.152, "frac_reward_zero_std": 0.0, "grad_norm": 0.12555015087127686, "learning_rate": 6.247329951206259e-07, "loss": 0.0004, "num_tokens": 13750733.0, "reward": 13.40833854675293, "reward_std": 1.396416425704956, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7877026796340942, "rewards/length2tails_reward/std": 0.2363130748271942, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2692625522613525, "rewards/thermo_reward/std": 1.3035361766815186, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08218161016702652, "epoch": 3.154, "frac_reward_zero_std": 0.0, "grad_norm": 0.09898626804351807, "learning_rate": 6.23544734625608e-07, "loss": 0.0033, "num_tokens": 13759489.0, "reward": 13.878093719482422, "reward_std": 0.3739182651042938, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8103445768356323, "rewards/length2tails_reward/std": 0.20301468670368195, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08957744669169188, "epoch": 3.156, "frac_reward_zero_std": 0.0, "grad_norm": 0.2144327163696289, "learning_rate": 6.223570931166851e-07, "loss": 0.0001, "num_tokens": 13768254.0, "reward": 13.625322341918945, "reward_std": 1.0448532104492188, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.799221396446228, "rewards/length2tails_reward/std": 0.24112744629383087, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4850940704345703, "rewards/thermo_reward/std": 0.911384642124176, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.46875, "completions/mean_terminated_length": 271.46875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.08084192033857107, "epoch": 3.158, "frac_reward_zero_std": 0.0, "grad_norm": 0.15178348124027252, "learning_rate": 6.211700725466351e-07, "loss": 0.0067, "num_tokens": 13776973.0, "reward": 13.87630558013916, "reward_std": 0.3730323016643524, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7924637794494629, "rewards/length2tails_reward/std": 0.24484948813915253, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.0818185918033123, "epoch": 3.16, "frac_reward_zero_std": 0.0, "grad_norm": 0.09542899578809738, "learning_rate": 6.199836748672152e-07, "loss": 0.0066, "num_tokens": 13785713.0, "reward": 13.722007751464844, "reward_std": 0.5323172211647034, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8449215292930603, "rewards/length2tails_reward/std": 0.19587825238704681, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498504638671875, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.53125, "completions/mean_terminated_length": 272.53125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08284872956573963, "epoch": 3.162, "frac_reward_zero_std": 0.0, "grad_norm": 0.13644354045391083, "learning_rate": 6.187979020291583e-07, "loss": 0.0023, "num_tokens": 13794466.0, "reward": 13.477725982666016, "reward_std": 1.6386417150497437, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5256075859069824, "rewards/kidney_reward/std": 0.5706179141998291, "rewards/length2tails_reward/mean": 0.7800761461257935, "rewards/length2tails_reward/std": 0.24554499983787537, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4129247665405273, "rewards/thermo_reward/std": 1.196850061416626, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.1875, "completions/mean_terminated_length": 273.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08427475392818451, "epoch": 3.164, "frac_reward_zero_std": 0.0, "grad_norm": 0.10350681841373444, "learning_rate": 6.176127559821698e-07, "loss": 0.0056, "num_tokens": 13803240.0, "reward": 13.82392406463623, "reward_std": 0.5608053207397461, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8158328533172607, "rewards/length2tails_reward/std": 0.2515362501144409, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.5, "completions/mean_terminated_length": 273.5, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09279169328510761, "epoch": 3.166, "frac_reward_zero_std": 0.0, "grad_norm": 0.1039460077881813, "learning_rate": 6.164282386749248e-07, "loss": 0.0003, "num_tokens": 13812024.0, "reward": 13.722917556762695, "reward_std": 0.5369901657104492, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8540178537368774, "rewards/length2tails_reward/std": 0.18012025952339172, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498504638671875, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.1875, "completions/mean_terminated_length": 271.1875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.09811558667570353, "epoch": 3.168, "frac_reward_zero_std": 0.0, "grad_norm": 0.22617964446544647, "learning_rate": 6.152443520550641e-07, "loss": 0.0045, "num_tokens": 13820734.0, "reward": 13.63374137878418, "reward_std": 1.0049004554748535, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8045656681060791, "rewards/length2tails_reward/std": 0.2367853969335556, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.492978096008301, "rewards/thermo_reward/std": 0.8734596967697144, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.84375, "completions/mean_terminated_length": 272.84375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07768923044204712, "epoch": 3.17, "frac_reward_zero_std": 0.0, "grad_norm": 0.09221001714468002, "learning_rate": 6.14061098069192e-07, "loss": -0.0039, "num_tokens": 13829497.0, "reward": 13.746610641479492, "reward_std": 0.7933074235916138, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7430543899536133, "rewards/length2tails_reward/std": 0.33624374866485596, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07595672411844134, "epoch": 3.172, "frac_reward_zero_std": 0.0, "grad_norm": 0.10004629194736481, "learning_rate": 6.128784786628721e-07, "loss": 0.0, "num_tokens": 13838251.0, "reward": 13.531187057495117, "reward_std": 2.1729977130889893, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.515078544616699, "rewards/kidney_reward/std": 0.6301804184913635, "rewards/length2tails_reward/mean": 0.7734501361846924, "rewards/length2tails_reward/std": 0.26072344183921814, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.535088539123535, "rewards/thermo_reward/std": 1.2374815940856934, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.08676011674106121, "epoch": 3.174, "frac_reward_zero_std": 0.0, "grad_norm": 0.07602910697460175, "learning_rate": 6.116964957806252e-07, "loss": -0.0037, "num_tokens": 13846971.0, "reward": 13.728689193725586, "reward_std": 1.1042555570602417, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7970216274261475, "rewards/length2tails_reward/std": 0.2547018229961395, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5613222122192383, "rewards/thermo_reward/std": 1.0945703983306885, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.08311831764876842, "epoch": 3.176, "frac_reward_zero_std": 0.0, "grad_norm": 0.07211422175168991, "learning_rate": 6.105151513659248e-07, "loss": -0.0072, "num_tokens": 13855651.0, "reward": 13.759332656860352, "reward_std": 1.151942491531372, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8217657804489136, "rewards/length2tails_reward/std": 0.21550104022026062, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5894906520843506, "rewards/thermo_reward/std": 1.1446915864944458, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08453467022627592, "epoch": 3.178, "frac_reward_zero_std": 0.0, "grad_norm": 0.09263108670711517, "learning_rate": 6.093344473611951e-07, "loss": -0.0019, "num_tokens": 13864409.0, "reward": 13.754880905151367, "reward_std": 0.5109540820121765, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7747988700866699, "rewards/length2tails_reward/std": 0.24791789054870605, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08336858032271266, "epoch": 3.18, "frac_reward_zero_std": 0.0, "grad_norm": 0.09499793499708176, "learning_rate": 6.081543857078075e-07, "loss": -0.0011, "num_tokens": 13873181.0, "reward": 13.66617202758789, "reward_std": 0.6704643368721008, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.833747148513794, "rewards/length2tails_reward/std": 0.21232077479362488, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498504638671875, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.84375, "completions/mean_terminated_length": 272.84375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08754700422286987, "epoch": 3.182, "frac_reward_zero_std": 0.0, "grad_norm": 0.1134149506688118, "learning_rate": 6.069749683460764e-07, "loss": -0.0025, "num_tokens": 13881944.0, "reward": 13.244776725769043, "reward_std": 1.9161473512649536, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.506251811981201, "rewards/kidney_reward/std": 0.5428574681282043, "rewards/length2tails_reward/mean": 0.8059432506561279, "rewards/length2tails_reward/std": 0.22836993634700775, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1967456340789795, "rewards/thermo_reward/std": 1.510373592376709, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08679799642413855, "epoch": 3.184, "frac_reward_zero_std": 0.0, "grad_norm": 0.13869330286979675, "learning_rate": 6.057961972152578e-07, "loss": 0.0036, "num_tokens": 13890708.0, "reward": 13.838022232055664, "reward_std": 0.42595183849334717, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8084931373596191, "rewards/length2tails_reward/std": 0.2337682992219925, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08560691121965647, "epoch": 3.186, "frac_reward_zero_std": 0.0, "grad_norm": 0.0591062568128109, "learning_rate": 6.046180742535441e-07, "loss": -0.0063, "num_tokens": 13899456.0, "reward": 13.649131774902344, "reward_std": 1.1960653066635132, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7516544461250305, "rewards/length2tails_reward/std": 0.29946169257164, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.513659954071045, "rewards/thermo_reward/std": 1.149131417274475, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.96875, "completions/mean_terminated_length": 270.96875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.08243997674435377, "epoch": 3.188, "frac_reward_zero_std": 0.0, "grad_norm": 0.12632687389850616, "learning_rate": 6.034406013980628e-07, "loss": 0.0035, "num_tokens": 13908159.0, "reward": 13.668336868286133, "reward_std": 0.5609111189842224, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7070699334144592, "rewards/length2tails_reward/std": 0.323891818523407, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5099644660949707, "rewards/thermo_reward/std": 0.5615194439888, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 273.03125, "completions/mean_terminated_length": 273.03125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09201567247509956, "epoch": 3.19, "frac_reward_zero_std": 0.0, "grad_norm": 0.6218602657318115, "learning_rate": 6.022637805848723e-07, "loss": 0.0218, "num_tokens": 13916928.0, "reward": 13.492816925048828, "reward_std": 1.2237213850021362, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7628862857818604, "rewards/length2tails_reward/std": 0.2679445147514343, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.356222152709961, "rewards/thermo_reward/std": 1.125559687614441, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08387936279177666, "epoch": 3.192, "frac_reward_zero_std": 0.0, "grad_norm": 0.11393842101097107, "learning_rate": 6.010876137489583e-07, "loss": 0.0029, "num_tokens": 13925660.0, "reward": 13.91272258758545, "reward_std": 0.31023505330085754, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7577773332595825, "rewards/length2tails_reward/std": 0.23585514724254608, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0865836851298809, "epoch": 3.194, "frac_reward_zero_std": 0.0, "grad_norm": 0.12148909270763397, "learning_rate": 5.999121028242322e-07, "loss": -0.003, "num_tokens": 13934404.0, "reward": 13.874883651733398, "reward_std": 0.38329583406448364, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7782378196716309, "rewards/length2tails_reward/std": 0.25293105840682983, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09070562291890383, "epoch": 3.196, "frac_reward_zero_std": 0.0, "grad_norm": 0.12602317333221436, "learning_rate": 5.987372497435258e-07, "loss": -0.0026, "num_tokens": 13943151.0, "reward": 13.795031547546387, "reward_std": 0.4765709638595581, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7774432897567749, "rewards/length2tails_reward/std": 0.265408992767334, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 269.8125, "completions/mean_terminated_length": 269.8125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.08400048688054085, "epoch": 3.198, "frac_reward_zero_std": 0.0, "grad_norm": 0.165093332529068, "learning_rate": 5.975630564385901e-07, "loss": 0.0061, "num_tokens": 13951817.0, "reward": 13.735469818115234, "reward_std": 0.6187280416488647, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7290137410163879, "rewards/length2tails_reward/std": 0.24236340820789337, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08165622828528285, "epoch": 3.2, "frac_reward_zero_std": 0.0, "grad_norm": 0.06480145454406738, "learning_rate": 5.96389524840091e-07, "loss": -0.002, "num_tokens": 13960533.0, "reward": 13.647340774536133, "reward_std": 1.2094513177871704, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.728061318397522, "rewards/length2tails_reward/std": 0.2604650855064392, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4868698120117188, "rewards/thermo_reward/std": 1.2013118267059326, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.3125, "completions/mean_terminated_length": 271.3125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08520669303834438, "epoch": 3.202, "frac_reward_zero_std": 0.0, "grad_norm": 0.2728014588356018, "learning_rate": 5.952166568776062e-07, "loss": 0.0036, "num_tokens": 13969247.0, "reward": 13.399568557739258, "reward_std": 2.2984795570373535, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.449371814727783, "rewards/kidney_reward/std": 0.8562029600143433, "rewards/length2tails_reward/mean": 0.7222508192062378, "rewards/length2tails_reward/std": 0.28240150213241577, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4167864322662354, "rewards/thermo_reward/std": 1.4644033908843994, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.96875, "completions/mean_terminated_length": 270.96875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "entropy": 0.09271425474435091, "epoch": 3.204, "frac_reward_zero_std": 0.0, "grad_norm": 0.10972724854946136, "learning_rate": 5.940444544796222e-07, "loss": -0.0014, "num_tokens": 13977950.0, "reward": 13.836832046508789, "reward_std": 0.4348587691783905, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7965885996818542, "rewards/length2tails_reward/std": 0.2626652121543884, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08314972370862961, "epoch": 3.206, "frac_reward_zero_std": 0.0, "grad_norm": 0.12602828443050385, "learning_rate": 5.928729195735318e-07, "loss": -0.0027, "num_tokens": 13986708.0, "reward": 13.495146751403809, "reward_std": 1.7042378187179565, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.5109715461730957, "rewards/kidney_reward/std": 0.5173211097717285, "rewards/length2tails_reward/mean": 0.8184359073638916, "rewards/length2tails_reward/std": 0.2159198671579361, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5561649799346924, "rewards/thermo_reward/std": 0.925395131111145, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.08746057841926813, "epoch": 3.208, "frac_reward_zero_std": 0.0, "grad_norm": 0.26632049679756165, "learning_rate": 5.917020540856294e-07, "loss": -0.0028, "num_tokens": 13995473.0, "reward": 12.22585678100586, "reward_std": 6.362361431121826, "rewards/fitness_reward/mean": 6.649763107299805, "rewards/fitness_reward/std": 2.8078510761260986, "rewards/kidney_reward/mean": 2.2407450675964355, "rewards/kidney_reward/std": 1.53963041305542, "rewards/length2tails_reward/mean": 0.8339070677757263, "rewards/length2tails_reward/std": 0.22544747591018677, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1519570350646973, "rewards/thermo_reward/std": 2.0436434745788574, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.0865698466077447, "epoch": 3.21, "frac_reward_zero_std": 0.0, "grad_norm": 0.10460247844457626, "learning_rate": 5.905318599411097e-07, "loss": -0.0009, "num_tokens": 14004227.0, "reward": 13.839855194091797, "reward_std": 0.4297863841056824, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8268245458602905, "rewards/length2tails_reward/std": 0.16668696701526642, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.09375, "completions/mean_terminated_length": 270.09375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.07823292072862387, "epoch": 3.212, "frac_reward_zero_std": 0.0, "grad_norm": 0.19168511033058167, "learning_rate": 5.89362339064062e-07, "loss": -0.0016, "num_tokens": 14012902.0, "reward": 13.867661476135254, "reward_std": 0.3797597885131836, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7060228586196899, "rewards/length2tails_reward/std": 0.3517718017101288, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 754.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 288.71875, "completions/mean_terminated_length": 273.70965576171875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.10296936891973019, "epoch": 3.214, "frac_reward_zero_std": 0.0, "grad_norm": 0.4714829623699188, "learning_rate": 5.881934933774701e-07, "loss": -0.0166, "num_tokens": 14022173.0, "reward": 13.924141883850098, "reward_std": 0.3132314682006836, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8719671964645386, "rewards/length2tails_reward/std": 0.1845170557498932, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.09375, "completions/mean_terminated_length": 272.09375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.08533268887549639, "epoch": 3.216, "frac_reward_zero_std": 0.0, "grad_norm": 0.0771593302488327, "learning_rate": 5.870253248032067e-07, "loss": 0.0064, "num_tokens": 14030912.0, "reward": 13.835899353027344, "reward_std": 0.4224575161933899, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7872604131698608, "rewards/length2tails_reward/std": 0.26647889614105225, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08216171525418758, "epoch": 3.218, "frac_reward_zero_std": 0.0, "grad_norm": 0.12162912636995316, "learning_rate": 5.858578352620321e-07, "loss": -0.0009, "num_tokens": 14039636.0, "reward": 13.789643287658691, "reward_std": 0.4721164107322693, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7235633134841919, "rewards/length2tails_reward/std": 0.2913040816783905, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.65625, "completions/mean_terminated_length": 273.65625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.10002035647630692, "epoch": 3.22, "frac_reward_zero_std": 0.0, "grad_norm": 0.05761205032467842, "learning_rate": 5.846910266735889e-07, "loss": -0.0064, "num_tokens": 14048425.0, "reward": 13.744827270507812, "reward_std": 1.2510325908660889, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8414597511291504, "rewards/length2tails_reward/std": 0.23725688457489014, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.600375175476074, "rewards/thermo_reward/std": 1.084394931793213, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.07954447437077761, "epoch": 3.222, "frac_reward_zero_std": 0.0, "grad_norm": 0.12211769074201584, "learning_rate": 5.835249009564012e-07, "loss": 0.0016, "num_tokens": 14057189.0, "reward": 13.800848960876465, "reward_std": 0.46823522448539734, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8356137871742249, "rewards/length2tails_reward/std": 0.18387895822525024, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "entropy": 0.09631557948887348, "epoch": 3.224, "frac_reward_zero_std": 0.0, "grad_norm": 0.3751218616962433, "learning_rate": 5.82359460027869e-07, "loss": -0.0072, "num_tokens": 14065934.0, "reward": 13.086830139160156, "reward_std": 3.9839515686035156, "rewards/fitness_reward/mean": 7.038168907165527, "rewards/fitness_reward/std": 1.8272552490234375, "rewards/kidney_reward/mean": 2.468914747238159, "rewards/kidney_reward/std": 0.8913218379020691, "rewards/length2tails_reward/mean": 0.7623480558395386, "rewards/length2tails_reward/std": 0.2841353416442871, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4035110473632812, "rewards/thermo_reward/std": 1.328399896621704, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.09375, "completions/mean_terminated_length": 272.09375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08313568867743015, "epoch": 3.226, "frac_reward_zero_std": 0.0, "grad_norm": 0.12886476516723633, "learning_rate": 5.811947058042676e-07, "loss": 0.0039, "num_tokens": 14074673.0, "reward": 13.833187103271484, "reward_std": 0.4242973327636719, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.760138988494873, "rewards/length2tails_reward/std": 0.26653754711151123, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.34375, "completions/mean_terminated_length": 273.34375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08375045098364353, "epoch": 3.228, "frac_reward_zero_std": 0.0, "grad_norm": 0.05681309849023819, "learning_rate": 5.800306402007427e-07, "loss": 0.0023, "num_tokens": 14083452.0, "reward": 13.955066680908203, "reward_std": 0.22359304130077362, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7823522090911865, "rewards/length2tails_reward/std": 0.29082199931144714, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.5, "completions/mean_terminated_length": 270.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0816805730573833, "epoch": 3.23, "frac_reward_zero_std": 0.0, "grad_norm": 0.08435562252998352, "learning_rate": 5.788672651313078e-07, "loss": -0.0037, "num_tokens": 14092140.0, "reward": 13.383923530578613, "reward_std": 1.6765855550765991, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.6398475170135498, "rewards/length2tails_reward/std": 0.33842137455940247, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3143515586853027, "rewards/thermo_reward/std": 1.4448281526565552, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.9375, "completions/mean_terminated_length": 272.9375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.07523797173053026, "epoch": 3.232, "frac_reward_zero_std": 0.0, "grad_norm": 0.0788172259926796, "learning_rate": 5.777045825088403e-07, "loss": -0.0042, "num_tokens": 14100906.0, "reward": 13.541112899780273, "reward_std": 1.3472471237182617, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5384316444396973, "rewards/kidney_reward/std": 0.4980745017528534, "rewards/length2tails_reward/mean": 0.7591254711151123, "rewards/length2tails_reward/std": 0.333173930644989, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.523092269897461, "rewards/thermo_reward/std": 0.906480073928833, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0888227610848844, "epoch": 3.234, "frac_reward_zero_std": 0.0, "grad_norm": 0.13339588046073914, "learning_rate": 5.765425942450801e-07, "loss": 0.0032, "num_tokens": 14109671.0, "reward": 13.539932250976562, "reward_std": 1.5400187969207764, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.534397602081299, "rewards/kidney_reward/std": 0.520893931388855, "rewards/length2tails_reward/mean": 0.8074017763137817, "rewards/length2tails_reward/std": 0.2371595948934555, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.463608741760254, "rewards/thermo_reward/std": 1.1467925310134888, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.84375, "completions/mean_terminated_length": 272.84375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.08746557729318738, "epoch": 3.2359999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.19486036896705627, "learning_rate": 5.753813022506247e-07, "loss": -0.0104, "num_tokens": 14118434.0, "reward": 13.883773803710938, "reward_std": 0.3759906589984894, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.867151141166687, "rewards/length2tails_reward/std": 0.20712190866470337, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.40625, "completions/mean_terminated_length": 272.40625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08746451884508133, "epoch": 3.238, "frac_reward_zero_std": 0.0, "grad_norm": 0.10658274590969086, "learning_rate": 5.742207084349273e-07, "loss": -0.0036, "num_tokens": 14127183.0, "reward": 13.65631103515625, "reward_std": 0.9603863954544067, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7335498332977295, "rewards/length2tails_reward/std": 0.2952185273170471, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.522650718688965, "rewards/thermo_reward/std": 0.9086843132972717, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08457469288259745, "epoch": 3.24, "frac_reward_zero_std": 0.0, "grad_norm": 0.0973481833934784, "learning_rate": 5.730608147062925e-07, "loss": 0.0027, "num_tokens": 14135948.0, "reward": 13.033241271972656, "reward_std": 4.528761386871338, "rewards/fitness_reward/mean": 6.997109889984131, "rewards/fitness_reward/std": 2.0595204830169678, "rewards/kidney_reward/mean": 2.438572645187378, "rewards/kidney_reward/std": 1.0629626512527466, "rewards/length2tails_reward/mean": 0.786112904548645, "rewards/length2tails_reward/std": 0.272849977016449, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.418948173522949, "rewards/thermo_reward/std": 1.452711582183838, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.25, "completions/mean_terminated_length": 273.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08236623834818602, "epoch": 3.242, "frac_reward_zero_std": 0.0, "grad_norm": 0.11871770024299622, "learning_rate": 5.719016229718747e-07, "loss": 0.0008, "num_tokens": 14144724.0, "reward": 13.879434585571289, "reward_std": 0.37527257204055786, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.823756217956543, "rewards/length2tails_reward/std": 0.23749664425849915, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 271.1875, "completions/mean_terminated_length": 271.1875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.09134671650826931, "epoch": 3.2439999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.1924533098936081, "learning_rate": 5.707431351376726e-07, "loss": 0.0038, "num_tokens": 14153434.0, "reward": 13.956525802612305, "reward_std": 0.22384217381477356, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7969448566436768, "rewards/length2tails_reward/std": 0.2788224220275879, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.0625, "completions/mean_terminated_length": 271.0625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.09193257614970207, "epoch": 3.246, "frac_reward_zero_std": 0.0, "grad_norm": 0.17157341539859772, "learning_rate": 5.695853531085286e-07, "loss": 0.0052, "num_tokens": 14162140.0, "reward": 13.958206176757812, "reward_std": 0.22330915927886963, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8137524127960205, "rewards/length2tails_reward/std": 0.21452507376670837, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.08646232448518276, "epoch": 3.248, "frac_reward_zero_std": 0.0, "grad_norm": 0.10585270076990128, "learning_rate": 5.684282787881247e-07, "loss": 0.0025, "num_tokens": 14170898.0, "reward": 13.920055389404297, "reward_std": 0.3132040500640869, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8310965299606323, "rewards/length2tails_reward/std": 0.1681250035762787, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.875, "completions/mean_terminated_length": 270.875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.08505373820662498, "epoch": 3.25, "frac_reward_zero_std": 0.0, "grad_norm": 0.08246946334838867, "learning_rate": 5.672719140789785e-07, "loss": 0.002, "num_tokens": 14179598.0, "reward": 13.871101379394531, "reward_std": 0.37816736102104187, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7404244542121887, "rewards/length2tails_reward/std": 0.3039153516292572, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.5625, "completions/mean_terminated_length": 270.5625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.08091865479946136, "epoch": 3.252, "frac_reward_zero_std": 0.0, "grad_norm": 0.07720610499382019, "learning_rate": 5.661162608824419e-07, "loss": -0.0026, "num_tokens": 14188288.0, "reward": 13.658710479736328, "reward_std": 1.0409196615219116, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7231732606887817, "rewards/length2tails_reward/std": 0.3009643852710724, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.526088237762451, "rewards/thermo_reward/std": 0.8915759921073914, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08896706625819206, "epoch": 3.254, "frac_reward_zero_std": 0.0, "grad_norm": 0.10086528956890106, "learning_rate": 5.649613210986953e-07, "loss": 0.0011, "num_tokens": 14197053.0, "reward": 13.81051254272461, "reward_std": 0.5248230695724487, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8069740533828735, "rewards/length2tails_reward/std": 0.24617759883403778, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.71875, "completions/mean_terminated_length": 272.71875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0814590915106237, "epoch": 3.2560000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.12942053377628326, "learning_rate": 5.638070966267479e-07, "loss": -0.0027, "num_tokens": 14205812.0, "reward": 13.797609329223633, "reward_std": 0.47521042823791504, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8032171726226807, "rewards/length2tails_reward/std": 0.24287335574626923, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.15625, "completions/mean_terminated_length": 273.15625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08510150481015444, "epoch": 3.258, "frac_reward_zero_std": 0.0, "grad_norm": 0.226546511054039, "learning_rate": 5.626535893644307e-07, "loss": 0.0052, "num_tokens": 14214585.0, "reward": 13.579366683959961, "reward_std": 1.3661936521530151, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7992387413978577, "rewards/length2tails_reward/std": 0.2424638569355011, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.524005651473999, "rewards/thermo_reward/std": 0.9019290804862976, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09344392456114292, "epoch": 3.26, "frac_reward_zero_std": 0.0, "grad_norm": 0.1050717830657959, "learning_rate": 5.615008012083973e-07, "loss": -0.0008, "num_tokens": 14223328.0, "reward": 13.75810432434082, "reward_std": 0.5049390196800232, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8070328831672668, "rewards/length2tails_reward/std": 0.22400563955307007, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.5, "completions/mean_terminated_length": 270.5, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.09122634120285511, "epoch": 3.262, "frac_reward_zero_std": 0.0, "grad_norm": 0.1369512975215912, "learning_rate": 5.60348734054118e-07, "loss": 0.001, "num_tokens": 14232016.0, "reward": 13.58402156829834, "reward_std": 1.7690011262893677, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4892263412475586, "rewards/kidney_reward/std": 0.6357679963111877, "rewards/length2tails_reward/mean": 0.8118189573287964, "rewards/length2tails_reward/std": 0.25912174582481384, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5524284839630127, "rewards/thermo_reward/std": 1.1428624391555786, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 273.1875, "completions/mean_terminated_length": 273.1875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.08804465923458338, "epoch": 3.2640000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.17186035215854645, "learning_rate": 5.591973897958781e-07, "loss": 0.0017, "num_tokens": 14240790.0, "reward": 13.677754402160645, "reward_std": 0.9940446615219116, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.848839282989502, "rewards/length2tails_reward/std": 0.1896086186170578, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5052061080932617, "rewards/thermo_reward/std": 0.9968248605728149, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09215592127293348, "epoch": 3.266, "frac_reward_zero_std": 0.0, "grad_norm": 0.17900444567203522, "learning_rate": 5.580467703267735e-07, "loss": -0.0005, "num_tokens": 14249555.0, "reward": 13.771299362182617, "reward_std": 0.553092896938324, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8137115240097046, "rewards/length2tails_reward/std": 0.2380651831626892, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.9375, "completions/mean_terminated_length": 272.9375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.09035858232527971, "epoch": 3.268, "frac_reward_zero_std": 0.0, "grad_norm": 0.10600990802049637, "learning_rate": 5.568968775387088e-07, "loss": 0.0018, "num_tokens": 14258321.0, "reward": 13.639888763427734, "reward_std": 0.5818853974342346, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8214516043663025, "rewards/length2tails_reward/std": 0.1866902858018875, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.470078706741333, "rewards/thermo_reward/std": 0.5830413699150085, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.15625, "completions/mean_terminated_length": 271.15625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.08797851856797934, "epoch": 3.27, "frac_reward_zero_std": 0.0, "grad_norm": 0.1364179253578186, "learning_rate": 5.55747713322394e-07, "loss": 0.0075, "num_tokens": 14267030.0, "reward": 13.79644775390625, "reward_std": 0.4668932259082794, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7916043996810913, "rewards/length2tails_reward/std": 0.24688823521137238, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07624836079776287, "epoch": 3.2720000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.07006789743900299, "learning_rate": 5.545992795673407e-07, "loss": -0.0048, "num_tokens": 14275790.0, "reward": 13.2792329788208, "reward_std": 2.7991321086883545, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.4431636333465576, "rewards/kidney_reward/std": 0.8907662630081177, "rewards/length2tails_reward/mean": 0.7920898199081421, "rewards/length2tails_reward/std": 0.27686822414398193, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.410693645477295, "rewards/thermo_reward/std": 1.2904707193374634, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09355740807950497, "epoch": 3.274, "frac_reward_zero_std": 0.0, "grad_norm": 0.10698295384645462, "learning_rate": 5.534515781618603e-07, "loss": 0.0051, "num_tokens": 14284526.0, "reward": 13.92752456665039, "reward_std": 0.37725645303726196, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7805237770080566, "rewards/length2tails_reward/std": 0.2552531957626343, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08121457789093256, "epoch": 3.276, "frac_reward_zero_std": 0.0, "grad_norm": 0.10736272484064102, "learning_rate": 5.523046109930586e-07, "loss": -0.0001, "num_tokens": 14293278.0, "reward": 13.421783447265625, "reward_std": 2.360363721847534, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.484513759613037, "rewards/kidney_reward/std": 0.8030804395675659, "rewards/length2tails_reward/mean": 0.8002185821533203, "rewards/length2tails_reward/std": 0.24185405671596527, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.453571319580078, "rewards/thermo_reward/std": 1.2668566703796387, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.71875, "completions/mean_terminated_length": 272.71875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08090145420283079, "epoch": 3.278, "frac_reward_zero_std": 0.0, "grad_norm": 0.19558988511562347, "learning_rate": 5.511583799468351e-07, "loss": 0.005, "num_tokens": 14302037.0, "reward": 13.693817138671875, "reward_std": 1.0849432945251465, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8103063106536865, "rewards/length2tails_reward/std": 0.22535519301891327, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.552481174468994, "rewards/thermo_reward/std": 0.9444604516029358, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.3125, "completions/mean_terminated_length": 271.3125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "entropy": 0.08695127349346876, "epoch": 3.2800000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.10942727327346802, "learning_rate": 5.500128869078788e-07, "loss": -0.0046, "num_tokens": 14310751.0, "reward": 13.836432456970215, "reward_std": 0.44083523750305176, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7925859689712524, "rewards/length2tails_reward/std": 0.23117829859256744, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 754.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 286.25, "completions/mean_terminated_length": 271.1612854003906, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09664624650031328, "epoch": 3.282, "frac_reward_zero_std": 0.0, "grad_norm": 0.7048646211624146, "learning_rate": 5.488681337596653e-07, "loss": -0.014, "num_tokens": 14319943.0, "reward": 12.315906524658203, "reward_std": 6.261025905609131, "rewards/fitness_reward/mean": 6.578765392303467, "rewards/fitness_reward/std": 3.084444046020508, "rewards/kidney_reward/mean": 2.2847700119018555, "rewards/kidney_reward/std": 1.4948978424072266, "rewards/length2tails_reward/mean": 0.711010754108429, "rewards/length2tails_reward/std": 0.2972946763038635, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2812695503234863, "rewards/thermo_reward/std": 1.7976088523864746, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.1875, "completions/mean_terminated_length": 270.1875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.08683570567518473, "epoch": 3.284, "frac_reward_zero_std": 0.0, "grad_norm": 0.09333089739084244, "learning_rate": 5.477241223844538e-07, "loss": -0.0044, "num_tokens": 14328621.0, "reward": 13.862743377685547, "reward_std": 0.38920146226882935, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.6568487882614136, "rewards/length2tails_reward/std": 0.3376544713973999, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 274.5, "completions/mean_terminated_length": 274.5, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0855266572907567, "epoch": 3.286, "frac_reward_zero_std": 0.0, "grad_norm": 0.11528667062520981, "learning_rate": 5.465808546632829e-07, "loss": -0.0029, "num_tokens": 14337437.0, "reward": 13.918694496154785, "reward_std": 0.3151986300945282, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8174866437911987, "rewards/length2tails_reward/std": 0.18618960678577423, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.08168186899274588, "epoch": 3.288, "frac_reward_zero_std": 0.0, "grad_norm": 0.10211341828107834, "learning_rate": 5.454383324759693e-07, "loss": 0.0062, "num_tokens": 14346153.0, "reward": 13.176010131835938, "reward_std": 3.7224204540252686, "rewards/fitness_reward/mean": 7.047780990600586, "rewards/fitness_reward/std": 1.772882342338562, "rewards/kidney_reward/mean": 2.4853415489196777, "rewards/kidney_reward/std": 0.7983972430229187, "rewards/length2tails_reward/mean": 0.7846165895462036, "rewards/length2tails_reward/std": 0.28638705611228943, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.464426040649414, "rewards/thermo_reward/std": 1.2092392444610596, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 272.1875, "completions/mean_terminated_length": 272.1875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0881417142227292, "epoch": 3.29, "frac_reward_zero_std": 0.0, "grad_norm": 0.13756471872329712, "learning_rate": 5.442965577011038e-07, "loss": -0.0025, "num_tokens": 14354895.0, "reward": 13.877664566040039, "reward_std": 0.3820107877254486, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8060472011566162, "rewards/length2tails_reward/std": 0.2322707325220108, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.3125, "completions/mean_terminated_length": 273.3125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0942190820351243, "epoch": 3.292, "frac_reward_zero_std": 0.0, "grad_norm": 0.10850630700588226, "learning_rate": 5.431555322160482e-07, "loss": 0.0034, "num_tokens": 14363673.0, "reward": 13.76106071472168, "reward_std": 0.5022327899932861, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8365955948829651, "rewards/length2tails_reward/std": 0.1620035618543625, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897367000579834, "rewards/thermo_reward/std": 0.5061467885971069, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.9375, "completions/mean_terminated_length": 272.9375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08697703760117292, "epoch": 3.294, "frac_reward_zero_std": 0.0, "grad_norm": 0.12967857718467712, "learning_rate": 5.420152578969325e-07, "loss": -0.0004, "num_tokens": 14372439.0, "reward": 13.880912780761719, "reward_std": 0.37692704796791077, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8385316133499146, "rewards/length2tails_reward/std": 0.18997938930988312, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.09083990287035704, "epoch": 3.296, "frac_reward_zero_std": 0.0, "grad_norm": 0.1563207358121872, "learning_rate": 5.408757366186507e-07, "loss": 0.0048, "num_tokens": 14381209.0, "reward": 13.851879119873047, "reward_std": 0.47710543870925903, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8217875957489014, "rewards/length2tails_reward/std": 0.19817610085010529, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08517451956868172, "epoch": 3.298, "frac_reward_zero_std": 0.0, "grad_norm": 0.2848761975765228, "learning_rate": 5.397369702548594e-07, "loss": 0.0025, "num_tokens": 14389957.0, "reward": 12.93878173828125, "reward_std": 4.8922038078308105, "rewards/fitness_reward/mean": 6.990458011627197, "rewards/fitness_reward/std": 2.09714937210083, "rewards/kidney_reward/mean": 2.399808406829834, "rewards/kidney_reward/std": 1.1330952644348145, "rewards/length2tails_reward/mean": 0.757195234298706, "rewards/length2tails_reward/std": 0.28747931122779846, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.372796058654785, "rewards/thermo_reward/std": 1.7039433717727661, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.71875, "completions/mean_terminated_length": 272.71875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0872359941713512, "epoch": 3.3, "frac_reward_zero_std": 0.0, "grad_norm": 0.11530021578073502, "learning_rate": 5.385989606779736e-07, "loss": 0.0056, "num_tokens": 14398716.0, "reward": 13.840316772460938, "reward_std": 0.42341527342796326, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8314379453659058, "rewards/length2tails_reward/std": 0.21843001246452332, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.9375, "completions/mean_terminated_length": 272.9375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08460887148976326, "epoch": 3.302, "frac_reward_zero_std": 0.0, "grad_norm": 0.1057276651263237, "learning_rate": 5.37461709759165e-07, "loss": -0.0009, "num_tokens": 14407482.0, "reward": 13.875188827514648, "reward_std": 0.38063186407089233, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7812926173210144, "rewards/length2tails_reward/std": 0.25195032358169556, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08779798075556755, "epoch": 3.304, "frac_reward_zero_std": 0.0, "grad_norm": 0.11126738786697388, "learning_rate": 5.363252193683556e-07, "loss": -0.0042, "num_tokens": 14416220.0, "reward": 13.283806800842285, "reward_std": 1.834874153137207, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7586894035339355, "rewards/length2tails_reward/std": 0.3202492594718933, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.290010452270508, "rewards/thermo_reward/std": 1.2315627336502075, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08191088866442442, "epoch": 3.306, "frac_reward_zero_std": 0.0, "grad_norm": 0.13026760518550873, "learning_rate": 5.351894913742192e-07, "loss": 0.0005, "num_tokens": 14424963.0, "reward": 13.807825088500977, "reward_std": 0.8251524567604065, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.653838634490967, "rewards/kidney_reward/std": 0.15476679801940918, "rewards/length2tails_reward/mean": 0.7754607200622559, "rewards/length2tails_reward/std": 0.2539330720901489, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.615255355834961, "rewards/thermo_reward/std": 0.8070117831230164, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08574399258941412, "epoch": 3.308, "frac_reward_zero_std": 0.0, "grad_norm": 0.1388465017080307, "learning_rate": 5.340545276441754e-07, "loss": 0.0011, "num_tokens": 14433724.0, "reward": 13.835010528564453, "reward_std": 0.42451682686805725, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7783781290054321, "rewards/length2tails_reward/std": 0.2693405747413635, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.21875, "completions/mean_terminated_length": 271.21875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.08541551604866982, "epoch": 3.31, "frac_reward_zero_std": 0.0, "grad_norm": 0.5051350593566895, "learning_rate": 5.32920330044386e-07, "loss": -0.0118, "num_tokens": 14442435.0, "reward": 13.449106216430664, "reward_std": 2.082252025604248, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4897396564483643, "rewards/kidney_reward/std": 0.6329513788223267, "rewards/length2tails_reward/mean": 0.825819730758667, "rewards/length2tails_reward/std": 0.2356375902891159, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4731087684631348, "rewards/thermo_reward/std": 1.1634302139282227, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08114295545965433, "epoch": 3.312, "frac_reward_zero_std": 0.0, "grad_norm": 0.06512371450662613, "learning_rate": 5.317869004397544e-07, "loss": 0.0021, "num_tokens": 14451173.0, "reward": 13.90968132019043, "reward_std": 0.3100299835205078, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7273616790771484, "rewards/length2tails_reward/std": 0.3085137903690338, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08665010053664446, "epoch": 3.314, "frac_reward_zero_std": 0.0, "grad_norm": 0.11714247614145279, "learning_rate": 5.306542406939206e-07, "loss": 0.0042, "num_tokens": 14459943.0, "reward": 13.71487045288086, "reward_std": 0.5333173871040344, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7735536098480225, "rewards/length2tails_reward/std": 0.306517094373703, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498504638671875, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.65625, "completions/mean_terminated_length": 271.65625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08406441286206245, "epoch": 3.316, "frac_reward_zero_std": 0.0, "grad_norm": 0.10046909004449844, "learning_rate": 5.295223526692593e-07, "loss": -0.0017, "num_tokens": 14468668.0, "reward": 13.609403610229492, "reward_std": 0.9153671860694885, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7325645089149475, "rewards/length2tails_reward/std": 0.27868160605430603, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4484822750091553, "rewards/thermo_reward/std": 0.9082307815551758, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09391424618661404, "epoch": 3.318, "frac_reward_zero_std": 0.0, "grad_norm": 0.07182227820158005, "learning_rate": 5.283912382268762e-07, "loss": -0.0013, "num_tokens": 14477440.0, "reward": 13.687997817993164, "reward_std": 0.927980363368988, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8127250671386719, "rewards/length2tails_reward/std": 0.25279492139816284, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5190603733062744, "rewards/thermo_reward/std": 0.9266504049301147, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0778304161503911, "epoch": 3.32, "frac_reward_zero_std": 0.0, "grad_norm": 0.07771207392215729, "learning_rate": 5.272608992266039e-07, "loss": -0.0017, "num_tokens": 14486195.0, "reward": 13.879322052001953, "reward_std": 0.38042446970939636, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8226305842399597, "rewards/length2tails_reward/std": 0.21475577354431152, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08480161521583796, "epoch": 3.322, "frac_reward_zero_std": 0.0, "grad_norm": 0.09371089190244675, "learning_rate": 5.261313375270013e-07, "loss": -0.0041, "num_tokens": 14494938.0, "reward": 13.952810287475586, "reward_std": 0.23375768959522247, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7597858905792236, "rewards/length2tails_reward/std": 0.2678978443145752, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.90625, "completions/mean_terminated_length": 271.90625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09179816581308842, "epoch": 3.324, "frac_reward_zero_std": 0.0, "grad_norm": 0.14725930988788605, "learning_rate": 5.250025549853491e-07, "loss": 0.0017, "num_tokens": 14503671.0, "reward": 13.831579208374023, "reward_std": 0.4285857081413269, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7440521717071533, "rewards/length2tails_reward/std": 0.28048190474510193, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08306058449670672, "epoch": 3.326, "frac_reward_zero_std": 0.0, "grad_norm": 0.08595037460327148, "learning_rate": 5.238745534576461e-07, "loss": -0.0046, "num_tokens": 14512392.0, "reward": 12.498080253601074, "reward_std": 4.544852256774902, "rewards/fitness_reward/mean": 6.621407985687256, "rewards/fitness_reward/std": 2.7026584148406982, "rewards/kidney_reward/mean": 2.4149489402770996, "rewards/kidney_reward/std": 0.7349871397018433, "rewards/length2tails_reward/mean": 0.7154628038406372, "rewards/length2tails_reward/std": 0.32782670855522156, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.290177345275879, "rewards/thermo_reward/std": 1.4791014194488525, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0952264815568924, "epoch": 3.328, "frac_reward_zero_std": 0.0, "grad_norm": 0.09594647586345673, "learning_rate": 5.227473347986082e-07, "loss": 0.0028, "num_tokens": 14521143.0, "reward": 13.879316329956055, "reward_std": 0.37391072511672974, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8225646018981934, "rewards/length2tails_reward/std": 0.2126975804567337, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07835912797600031, "epoch": 3.33, "frac_reward_zero_std": 0.0, "grad_norm": 0.13977433741092682, "learning_rate": 5.216209008616621e-07, "loss": -0.0014, "num_tokens": 14529874.0, "reward": 13.880647659301758, "reward_std": 0.4383620321750641, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7106102705001831, "rewards/length2tails_reward/std": 0.31361716985702515, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.08603645162656903, "epoch": 3.332, "frac_reward_zero_std": 0.0, "grad_norm": 0.155784472823143, "learning_rate": 5.204952534989462e-07, "loss": -0.0121, "num_tokens": 14538606.0, "reward": 13.785726547241211, "reward_std": 0.8251009583473206, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8525975346565247, "rewards/length2tails_reward/std": 0.17560173571109772, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.612802505493164, "rewards/thermo_reward/std": 0.8198140859603882, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 270.65625, "completions/mean_terminated_length": 270.65625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.09190623741596937, "epoch": 3.334, "frac_reward_zero_std": 0.0, "grad_norm": 0.1395697444677353, "learning_rate": 5.193703945613043e-07, "loss": 0.0044, "num_tokens": 14547299.0, "reward": 13.883354187011719, "reward_std": 0.37622812390327454, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8629554510116577, "rewards/length2tails_reward/std": 0.20908312499523163, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08574157673865557, "epoch": 3.336, "frac_reward_zero_std": 0.0, "grad_norm": 0.11695639044046402, "learning_rate": 5.182463258982846e-07, "loss": 0.0033, "num_tokens": 14556053.0, "reward": 13.877433776855469, "reward_std": 0.3750380575656891, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8037418127059937, "rewards/length2tails_reward/std": 0.21794305741786957, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.8125, "completions/mean_terminated_length": 271.8125, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.08719206042587757, "epoch": 3.338, "frac_reward_zero_std": 0.0, "grad_norm": 0.16675855219364166, "learning_rate": 5.171230493581358e-07, "loss": -0.0018, "num_tokens": 14564783.0, "reward": 13.399309158325195, "reward_std": 2.053853988647461, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4316983222961426, "rewards/kidney_reward/std": 0.7683201432228088, "rewards/length2tails_reward/mean": 0.8151653409004211, "rewards/length2tails_reward/std": 0.2573068141937256, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4249095916748047, "rewards/thermo_reward/std": 1.2939376831054688, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 282.625, "completions/mean_terminated_length": 282.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.10062659624963999, "epoch": 3.34, "frac_reward_zero_std": 0.0, "grad_norm": 0.47647494077682495, "learning_rate": 5.160005667878033e-07, "loss": 0.0455, "num_tokens": 14573859.0, "reward": 13.183435440063477, "reward_std": 3.1175551414489746, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.511730670928955, "rewards/kidney_reward/std": 0.5132253766059875, "rewards/length2tails_reward/mean": 0.8262677192687988, "rewards/length2tails_reward/std": 0.2650676369667053, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4360239505767822, "rewards/thermo_reward/std": 1.1104899644851685, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.5625, "completions/mean_terminated_length": 273.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08673338778316975, "epoch": 3.342, "frac_reward_zero_std": 0.0, "grad_norm": 0.1299479454755783, "learning_rate": 5.148788800329277e-07, "loss": 0.0016, "num_tokens": 14582645.0, "reward": 13.880162239074707, "reward_std": 0.374368280172348, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8310284614562988, "rewards/length2tails_reward/std": 0.24202223122119904, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.09135918691754341, "epoch": 3.344, "frac_reward_zero_std": 0.0, "grad_norm": 0.08872079104185104, "learning_rate": 5.137579909378417e-07, "loss": -0.004, "num_tokens": 14591361.0, "reward": 13.507393836975098, "reward_std": 1.6749547719955444, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.536761999130249, "rewards/kidney_reward/std": 0.5075190663337708, "rewards/length2tails_reward/mean": 0.8185112476348877, "rewards/length2tails_reward/std": 0.23466837406158447, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.427595615386963, "rewards/thermo_reward/std": 1.2838958501815796, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.34375, "completions/mean_terminated_length": 273.34375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.085330362431705, "epoch": 3.346, "frac_reward_zero_std": 0.0, "grad_norm": 0.09901421517133713, "learning_rate": 5.126379013455655e-07, "loss": 0.0042, "num_tokens": 14600140.0, "reward": 13.88277816772461, "reward_std": 0.3741099238395691, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8571890592575073, "rewards/length2tails_reward/std": 0.15984788537025452, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.5, "completions/mean_terminated_length": 273.5, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08711870899423957, "epoch": 3.348, "frac_reward_zero_std": 0.0, "grad_norm": 0.09154262393712997, "learning_rate": 5.115186130978046e-07, "loss": 0.0012, "num_tokens": 14608924.0, "reward": 13.653093338012695, "reward_std": 0.6277295351028442, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8282286524772644, "rewards/length2tails_reward/std": 0.24916499853134155, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.50996470451355, "rewards/thermo_reward/std": 0.5615194439888, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0861954502761364, "epoch": 3.35, "frac_reward_zero_std": 0.0, "grad_norm": 0.1118447333574295, "learning_rate": 5.104001280349479e-07, "loss": 0.0055, "num_tokens": 14617676.0, "reward": 13.753969192504883, "reward_std": 0.4972566068172455, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.765673816204071, "rewards/length2tails_reward/std": 0.31640881299972534, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.5625, "completions/mean_terminated_length": 271.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08270327001810074, "epoch": 3.352, "frac_reward_zero_std": 0.0, "grad_norm": 0.13411538302898407, "learning_rate": 5.092824479960625e-07, "loss": -0.0039, "num_tokens": 14626398.0, "reward": 13.68724250793457, "reward_std": 1.2245506048202515, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.77997887134552, "rewards/length2tails_reward/std": 0.2146291881799698, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.521580696105957, "rewards/thermo_reward/std": 1.2119030952453613, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.375, "completions/mean_terminated_length": 270.375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.081757552921772, "epoch": 3.354, "frac_reward_zero_std": 0.0, "grad_norm": 0.08777818828821182, "learning_rate": 5.081655748188923e-07, "loss": 0.0, "num_tokens": 14635082.0, "reward": 13.873549461364746, "reward_std": 0.3831973671913147, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7649028301239014, "rewards/length2tails_reward/std": 0.27656090259552, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.6875, "completions/mean_terminated_length": 273.6875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.09073897916823626, "epoch": 3.356, "frac_reward_zero_std": 0.0, "grad_norm": 0.10093200951814651, "learning_rate": 5.070495103398551e-07, "loss": 0.0035, "num_tokens": 14643872.0, "reward": 13.886987686157227, "reward_std": 0.375377357006073, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8992826342582703, "rewards/length2tails_reward/std": 0.15805500745773315, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.8125, "completions/mean_terminated_length": 271.8125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09156694356352091, "epoch": 3.358, "frac_reward_zero_std": 0.0, "grad_norm": 0.08219363540410995, "learning_rate": 5.059342563940383e-07, "loss": 0.001, "num_tokens": 14652602.0, "reward": 13.767139434814453, "reward_std": 0.5540488958358765, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7721138000488281, "rewards/length2tails_reward/std": 0.23606324195861816, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 274.6875, "completions/mean_terminated_length": 274.6875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.08526533050462604, "epoch": 3.36, "frac_reward_zero_std": 0.0, "grad_norm": 0.09103113412857056, "learning_rate": 5.048198148151968e-07, "loss": 0.0024, "num_tokens": 14661424.0, "reward": 13.373743057250977, "reward_std": 1.516675591468811, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5116631984710693, "rewards/kidney_reward/std": 0.5135889053344727, "rewards/length2tails_reward/mean": 0.8729467988014221, "rewards/length2tails_reward/std": 0.22229276597499847, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3136000633239746, "rewards/thermo_reward/std": 1.1403894424438477, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.6875, "completions/mean_terminated_length": 271.6875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08375388663262129, "epoch": 3.362, "frac_reward_zero_std": 0.0, "grad_norm": 0.0875633955001831, "learning_rate": 5.037061874357502e-07, "loss": -0.0035, "num_tokens": 14670150.0, "reward": 13.83285140991211, "reward_std": 0.4310167133808136, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.756779670715332, "rewards/length2tails_reward/std": 0.2828678786754608, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08747315965592861, "epoch": 3.364, "frac_reward_zero_std": 0.0, "grad_norm": 0.12928470969200134, "learning_rate": 5.025933760867781e-07, "loss": 0.0077, "num_tokens": 14678886.0, "reward": 13.807638168334961, "reward_std": 0.5141726732254028, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7782381772994995, "rewards/length2tails_reward/std": 0.2559855580329895, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08404806908220053, "epoch": 3.366, "frac_reward_zero_std": 0.0, "grad_norm": 0.13107116520404816, "learning_rate": 5.014813825980196e-07, "loss": -0.0002, "num_tokens": 14687641.0, "reward": 13.807857513427734, "reward_std": 0.5213117003440857, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7804368734359741, "rewards/length2tails_reward/std": 0.2552006244659424, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08485516533255577, "epoch": 3.368, "frac_reward_zero_std": 0.0, "grad_norm": 0.18920296430587769, "learning_rate": 5.003702087978685e-07, "loss": 0.0048, "num_tokens": 14696370.0, "reward": 13.562747955322266, "reward_std": 1.0341124534606934, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7378770112991333, "rewards/length2tails_reward/std": 0.30395567417144775, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.456014394760132, "rewards/thermo_reward/std": 0.8731143474578857, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08026226470246911, "epoch": 3.37, "frac_reward_zero_std": 0.0, "grad_norm": 0.07742941379547119, "learning_rate": 4.992598565133709e-07, "loss": -0.0014, "num_tokens": 14705099.0, "reward": 13.728206634521484, "reward_std": 0.8616410493850708, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.709161102771759, "rewards/length2tails_reward/std": 0.32603222131729126, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5696256160736084, "rewards/thermo_reward/std": 0.8564317226409912, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.21875, "completions/mean_terminated_length": 273.21875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08489504549652338, "epoch": 3.372, "frac_reward_zero_std": 0.0, "grad_norm": 0.09348171204328537, "learning_rate": 4.981503275702227e-07, "loss": 0.0057, "num_tokens": 14713874.0, "reward": 13.881086349487305, "reward_std": 0.3735075891017914, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8402734398841858, "rewards/length2tails_reward/std": 0.2117583304643631, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.6875, "completions/mean_terminated_length": 273.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08448885660618544, "epoch": 3.374, "frac_reward_zero_std": 0.0, "grad_norm": 0.12279488891363144, "learning_rate": 4.970416237927645e-07, "loss": 0.0055, "num_tokens": 14722664.0, "reward": 13.677044868469238, "reward_std": 0.9966065883636475, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8499288558959961, "rewards/length2tails_reward/std": 0.23315902054309845, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5317471027374268, "rewards/thermo_reward/std": 0.8636287450790405, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.53125, "completions/mean_terminated_length": 273.53125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08651688508689404, "epoch": 3.376, "frac_reward_zero_std": 0.0, "grad_norm": 0.1983906626701355, "learning_rate": 4.959337470039815e-07, "loss": 0.0057, "num_tokens": 14731449.0, "reward": 13.744522094726562, "reward_std": 0.6199595332145691, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8195300102233887, "rewards/length2tails_reward/std": 0.22915887832641602, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.53125, "completions/mean_terminated_length": 270.53125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "entropy": 0.08888929057866335, "epoch": 3.378, "frac_reward_zero_std": 0.0, "grad_norm": 0.12954142689704895, "learning_rate": 4.948266990254988e-07, "loss": -0.0002, "num_tokens": 14740138.0, "reward": 13.636672973632812, "reward_std": 1.7633979320526123, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5204615592956543, "rewards/kidney_reward/std": 0.599727988243103, "rewards/length2tails_reward/mean": 0.7028523683547974, "rewards/length2tails_reward/std": 0.2704508304595947, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.58474063873291, "rewards/thermo_reward/std": 1.1710478067398071, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08910947199910879, "epoch": 3.38, "frac_reward_zero_std": 0.0, "grad_norm": 0.13247615098953247, "learning_rate": 4.937204816775787e-07, "loss": -0.0025, "num_tokens": 14748918.0, "reward": 13.64747428894043, "reward_std": 1.036157488822937, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8059340119361877, "rewards/length2tails_reward/std": 0.2467590719461441, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.506575345993042, "rewards/thermo_reward/std": 0.9898343086242676, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 282.65625, "completions/mean_terminated_length": 282.65625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0923977354541421, "epoch": 3.382, "frac_reward_zero_std": 0.0, "grad_norm": 0.16892749071121216, "learning_rate": 4.926150967791179e-07, "loss": -0.0091, "num_tokens": 14757995.0, "reward": 13.95375919342041, "reward_std": 0.22327548265457153, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7692772746086121, "rewards/length2tails_reward/std": 0.27421385049819946, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08730402961373329, "epoch": 3.384, "frac_reward_zero_std": 0.0, "grad_norm": 0.0977780744433403, "learning_rate": 4.915105461476435e-07, "loss": 0.0032, "num_tokens": 14766760.0, "reward": 13.720968246459961, "reward_std": 0.5337844491004944, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8345231413841248, "rewards/length2tails_reward/std": 0.20670267939567566, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498504638671875, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08574420027434826, "epoch": 3.386, "frac_reward_zero_std": 0.0, "grad_norm": 0.09038598090410233, "learning_rate": 4.904068315993117e-07, "loss": 0.0042, "num_tokens": 14775498.0, "reward": 13.764984130859375, "reward_std": 0.5463270545005798, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7505573630332947, "rewards/length2tails_reward/std": 0.256422758102417, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08899992611259222, "epoch": 3.388, "frac_reward_zero_std": 0.0, "grad_norm": 0.1362181156873703, "learning_rate": 4.893039549489039e-07, "loss": 0.0017, "num_tokens": 14784259.0, "reward": 13.048135757446289, "reward_std": 2.5231218338012695, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.452592670917511, "rewards/kidney_reward/mean": 2.468844175338745, "rewards/kidney_reward/std": 0.6112906336784363, "rewards/length2tails_reward/mean": 0.8081793785095215, "rewards/length2tails_reward/std": 0.22916291654109955, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.152308225631714, "rewards/thermo_reward/std": 1.6434428691864014, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07493429351598024, "epoch": 3.39, "frac_reward_zero_std": 0.0, "grad_norm": 0.0555286630988121, "learning_rate": 4.882019180098236e-07, "loss": -0.0042, "num_tokens": 14793003.0, "reward": 13.409624099731445, "reward_std": 2.6447482109069824, "rewards/fitness_reward/mean": 7.027220726013184, "rewards/fitness_reward/std": 1.8891884088516235, "rewards/kidney_reward/mean": 2.536163330078125, "rewards/kidney_reward/std": 0.510905921459198, "rewards/length2tails_reward/mean": 0.7673140168190002, "rewards/length2tails_reward/std": 0.28843799233436584, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 274.21875, "completions/mean_terminated_length": 274.21875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0757355852983892, "epoch": 3.392, "frac_reward_zero_std": 0.0, "grad_norm": 0.15180395543575287, "learning_rate": 4.871007225940939e-07, "loss": 0.006, "num_tokens": 14801810.0, "reward": 13.478560447692871, "reward_std": 1.304787278175354, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8556383848190308, "rewards/length2tails_reward/std": 0.20991253852844238, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3600502014160156, "rewards/thermo_reward/std": 1.1117326021194458, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.15625, "completions/mean_terminated_length": 272.15625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0866955453529954, "epoch": 3.394, "frac_reward_zero_std": 0.0, "grad_norm": 0.11245302855968475, "learning_rate": 4.860003705123538e-07, "loss": 0.009, "num_tokens": 14810551.0, "reward": 13.753739356994629, "reward_std": 0.4965442717075348, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.76337730884552, "rewards/length2tails_reward/std": 0.26430433988571167, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897367000579834, "rewards/thermo_reward/std": 0.5061467885971069, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08178166439756751, "epoch": 3.396, "frac_reward_zero_std": 0.0, "grad_norm": 0.11046211421489716, "learning_rate": 4.849008635738553e-07, "loss": -0.0003, "num_tokens": 14819302.0, "reward": 13.504169464111328, "reward_std": 2.3498494625091553, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.4508585929870605, "rewards/kidney_reward/std": 0.9934619069099426, "rewards/length2tails_reward/mean": 0.8170171976089478, "rewards/length2tails_reward/std": 0.21700961887836456, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5104241371154785, "rewards/thermo_reward/std": 1.3729063272476196, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 273.9375, "completions/mean_terminated_length": 273.9375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09255535062402487, "epoch": 3.398, "frac_reward_zero_std": 0.0, "grad_norm": 0.11002900451421738, "learning_rate": 4.838022035864618e-07, "loss": -0.0098, "num_tokens": 14828100.0, "reward": 13.837339401245117, "reward_std": 0.4396858513355255, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.801662802696228, "rewards/length2tails_reward/std": 0.2261502742767334, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.96875, "completions/mean_terminated_length": 272.96875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07731029391288757, "epoch": 3.4, "frac_reward_zero_std": 0.0, "grad_norm": 0.09554555267095566, "learning_rate": 4.827043923566434e-07, "loss": -0.0059, "num_tokens": 14836867.0, "reward": 13.464859008789062, "reward_std": 1.9004313945770264, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.528146982192993, "rewards/kidney_reward/std": 0.5562539100646973, "rewards/length2tails_reward/mean": 0.803817868232727, "rewards/length2tails_reward/std": 0.24521175026893616, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.395145893096924, "rewards/thermo_reward/std": 1.4224570989608765, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08424821449443698, "epoch": 3.402, "frac_reward_zero_std": 0.0, "grad_norm": 0.1601286232471466, "learning_rate": 4.816074316894749e-07, "loss": 0.0006, "num_tokens": 14845621.0, "reward": 13.775213241577148, "reward_std": 0.8285742998123169, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7672507166862488, "rewards/length2tails_reward/std": 0.2551310062408447, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.610823631286621, "rewards/thermo_reward/std": 0.8301683068275452, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07866405416280031, "epoch": 3.404, "frac_reward_zero_std": 0.0, "grad_norm": 0.11888407170772552, "learning_rate": 4.805113233886331e-07, "loss": 0.0013, "num_tokens": 14854366.0, "reward": 13.874348640441895, "reward_std": 0.37398761510849, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7728902101516724, "rewards/length2tails_reward/std": 0.2928953468799591, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.08661049697548151, "epoch": 3.406, "frac_reward_zero_std": 0.0, "grad_norm": 0.18512208759784698, "learning_rate": 4.794160692563917e-07, "loss": -0.0434, "num_tokens": 14863046.0, "reward": 13.088939666748047, "reward_std": 4.899957656860352, "rewards/fitness_reward/mean": 6.978641986846924, "rewards/fitness_reward/std": 2.1639907360076904, "rewards/kidney_reward/mean": 2.4049041271209717, "rewards/kidney_reward/std": 1.2534205913543701, "rewards/length2tails_reward/mean": 0.7930488586425781, "rewards/length2tails_reward/std": 0.23190155625343323, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5260891914367676, "rewards/thermo_reward/std": 1.4979829788208008, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "entropy": 0.08553109876811504, "epoch": 3.408, "frac_reward_zero_std": 0.0, "grad_norm": 0.10315986722707748, "learning_rate": 4.783216710936212e-07, "loss": 0.0052, "num_tokens": 14871793.0, "reward": 13.883841514587402, "reward_std": 0.374860554933548, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8678221106529236, "rewards/length2tails_reward/std": 0.1376638561487198, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.71875, "completions/mean_terminated_length": 271.71875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.07992964796721935, "epoch": 3.41, "frac_reward_zero_std": 0.0, "grad_norm": 0.10810313373804092, "learning_rate": 4.772281306997848e-07, "loss": 0.0039, "num_tokens": 14880520.0, "reward": 13.925355911254883, "reward_std": 0.312097430229187, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8841127157211304, "rewards/length2tails_reward/std": 0.12169551104307175, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 754.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 286.59375, "completions/mean_terminated_length": 271.51611328125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09155255556106567, "epoch": 3.412, "frac_reward_zero_std": 0.0, "grad_norm": 0.24644652009010315, "learning_rate": 4.761354498729344e-07, "loss": -0.0228, "num_tokens": 14889723.0, "reward": 13.73485279083252, "reward_std": 0.8732155561447144, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7760403156280518, "rewards/length2tails_reward/std": 0.25661709904670715, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5695838928222656, "rewards/thermo_reward/std": 0.85664302110672, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.59375, "completions/mean_terminated_length": 269.59375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.08220897056162357, "epoch": 3.414, "frac_reward_zero_std": 0.0, "grad_norm": 0.1083146333694458, "learning_rate": 4.7504363040970987e-07, "loss": -0.0039, "num_tokens": 14898382.0, "reward": 13.708334922790527, "reward_std": 0.8958057165145874, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.770789384841919, "rewards/length2tails_reward/std": 0.2582211494445801, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.570950508117676, "rewards/thermo_reward/std": 0.8497097492218018, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.5625, "completions/mean_terminated_length": 271.5625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08558735437691212, "epoch": 3.416, "frac_reward_zero_std": 0.0, "grad_norm": 0.08821485936641693, "learning_rate": 4.7395267410533304e-07, "loss": 0.0004, "num_tokens": 14907104.0, "reward": 13.673770904541016, "reward_std": 0.954501211643219, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7165445685386658, "rewards/length2tails_reward/std": 0.25880908966064453, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.514451503753662, "rewards/thermo_reward/std": 0.949852705001831, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07648206362500787, "epoch": 3.418, "frac_reward_zero_std": 0.0, "grad_norm": 0.13279765844345093, "learning_rate": 4.728625827536079e-07, "loss": 0.0041, "num_tokens": 14915841.0, "reward": 13.487560272216797, "reward_std": 2.0662591457366943, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.486978769302368, "rewards/kidney_reward/std": 0.6481077075004578, "rewards/length2tails_reward/mean": 0.7673639059066772, "rewards/length2tails_reward/std": 0.23985138535499573, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5201685428619385, "rewards/thermo_reward/std": 1.2155030965805054, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 280.78125, "completions/mean_terminated_length": 280.78125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08176046423614025, "epoch": 3.42, "frac_reward_zero_std": 0.0, "grad_norm": 0.13497784733772278, "learning_rate": 4.7177335814691564e-07, "loss": -0.0096, "num_tokens": 14924858.0, "reward": 13.632104873657227, "reward_std": 1.6099051237106323, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5389091968536377, "rewards/kidney_reward/std": 0.4953727722167969, "rewards/length2tails_reward/mean": 0.7525918483734131, "rewards/length2tails_reward/std": 0.2809031903743744, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5567517280578613, "rewards/thermo_reward/std": 1.1193660497665405, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.25, "completions/mean_terminated_length": 273.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08430183865129948, "epoch": 3.422, "frac_reward_zero_std": 0.0, "grad_norm": 0.10553085058927536, "learning_rate": 4.7068500207621255e-07, "loss": 0.0019, "num_tokens": 14933634.0, "reward": 13.798028945922852, "reward_std": 0.46553653478622437, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8074179887771606, "rewards/length2tails_reward/std": 0.2968822121620178, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08506747148931026, "epoch": 3.424, "frac_reward_zero_std": 0.0, "grad_norm": 0.118733249604702, "learning_rate": 4.6959751633102673e-07, "loss": 0.0003, "num_tokens": 14942395.0, "reward": 13.716915130615234, "reward_std": 0.5371593832969666, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.79399573802948, "rewards/length2tails_reward/std": 0.2721828520298004, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498507022857666, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.3125, "completions/mean_terminated_length": 273.3125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08996152225881815, "epoch": 3.426, "frac_reward_zero_std": 0.0, "grad_norm": 0.10672859847545624, "learning_rate": 4.685109026994556e-07, "loss": 0.0026, "num_tokens": 14951173.0, "reward": 13.839287757873535, "reward_std": 0.43059033155441284, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8211460113525391, "rewards/length2tails_reward/std": 0.24959631264209747, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0840189615264535, "epoch": 3.428, "frac_reward_zero_std": 0.0, "grad_norm": 0.09118340164422989, "learning_rate": 4.674251629681615e-07, "loss": 0.0038, "num_tokens": 14959923.0, "reward": 13.689261436462402, "reward_std": 0.6019656658172607, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.791059136390686, "rewards/length2tails_reward/std": 0.24709247052669525, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498507022857666, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 273.65625, "completions/mean_terminated_length": 273.65625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08755708485841751, "epoch": 3.43, "frac_reward_zero_std": 0.0, "grad_norm": 0.11407990753650665, "learning_rate": 4.663402989223709e-07, "loss": -0.0055, "num_tokens": 14968712.0, "reward": 13.876548767089844, "reward_std": 0.38363367319107056, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7948861718177795, "rewards/length2tails_reward/std": 0.21179290115833282, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.08625697903335094, "epoch": 3.432, "frac_reward_zero_std": 0.0, "grad_norm": 0.07827986031770706, "learning_rate": 4.652563123458703e-07, "loss": -0.0005, "num_tokens": 14977424.0, "reward": 13.826070785522461, "reward_std": 0.944733202457428, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7625695466995239, "rewards/length2tails_reward/std": 0.26302239298820496, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6895079612731934, "rewards/thermo_reward/std": 0.7893825769424438, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08533341623842716, "epoch": 3.434, "frac_reward_zero_std": 0.0, "grad_norm": 0.16518720984458923, "learning_rate": 4.641732050210031e-07, "loss": 0.0011, "num_tokens": 14986168.0, "reward": 12.971258163452148, "reward_std": 4.007254600524902, "rewards/fitness_reward/mean": 7.052707672119141, "rewards/fitness_reward/std": 1.745011329650879, "rewards/kidney_reward/mean": 2.399848461151123, "rewards/kidney_reward/std": 0.9084376096725464, "rewards/length2tails_reward/mean": 0.7630383968353271, "rewards/length2tails_reward/std": 0.2822224199771881, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.342397928237915, "rewards/thermo_reward/std": 1.6145765781402588, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08906451426446438, "epoch": 3.436, "frac_reward_zero_std": 0.0, "grad_norm": 0.11727608740329742, "learning_rate": 4.6309097872866766e-07, "loss": -0.0033, "num_tokens": 14994897.0, "reward": 13.910297393798828, "reward_std": 0.31524109840393066, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7335153222084045, "rewards/length2tails_reward/std": 0.2708199620246887, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.34375, "completions/mean_terminated_length": 273.34375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.0859810272231698, "epoch": 3.438, "frac_reward_zero_std": 0.0, "grad_norm": 0.08078237622976303, "learning_rate": 4.6200963524831284e-07, "loss": 0.0031, "num_tokens": 15003676.0, "reward": 13.933954238891602, "reward_std": 0.3779996335506439, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.844823956489563, "rewards/length2tails_reward/std": 0.1683105230331421, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.65625, "completions/mean_terminated_length": 271.65625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.0825602188706398, "epoch": 3.44, "frac_reward_zero_std": 0.0, "grad_norm": 0.0974683165550232, "learning_rate": 4.609291763579357e-07, "loss": -0.0025, "num_tokens": 15012401.0, "reward": 13.808998107910156, "reward_std": 0.5261107683181763, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7918394207954407, "rewards/length2tails_reward/std": 0.2661225199699402, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08152022119611502, "epoch": 3.442, "frac_reward_zero_std": 0.0, "grad_norm": 0.06283272057771683, "learning_rate": 4.5984960383408e-07, "loss": -0.0053, "num_tokens": 15021161.0, "reward": 13.772812843322754, "reward_std": 1.2733838558197021, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.78622967004776, "rewards/length2tails_reward/std": 0.2837279736995697, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6065256595611572, "rewards/thermo_reward/std": 1.2588016986846924, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.40625, "completions/mean_terminated_length": 271.40625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08558083605021238, "epoch": 3.444, "frac_reward_zero_std": 0.0, "grad_norm": 0.11926688253879547, "learning_rate": 4.5877091945183143e-07, "loss": -0.0023, "num_tokens": 15029878.0, "reward": 13.610316276550293, "reward_std": 1.2155641317367554, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7381070852279663, "rewards/length2tails_reward/std": 0.24422591924667358, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.448840856552124, "rewards/thermo_reward/std": 1.20847487449646, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.15625, "completions/mean_terminated_length": 272.15625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08817206136882305, "epoch": 3.446, "frac_reward_zero_std": 0.0, "grad_norm": 0.07041650265455246, "learning_rate": 4.576931249848155e-07, "loss": -0.0032, "num_tokens": 15038619.0, "reward": 13.744803428649902, "reward_std": 0.8200621008872986, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.536046028137207, "rewards/kidney_reward/std": 0.5115687847137451, "rewards/length2tails_reward/mean": 0.7806336879730225, "rewards/length2tails_reward/std": 0.2577957212924957, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.08811759203672409, "epoch": 3.448, "frac_reward_zero_std": 0.0, "grad_norm": 0.3796294927597046, "learning_rate": 4.5661622220519455e-07, "loss": -0.0211, "num_tokens": 15047347.0, "reward": 13.921072006225586, "reward_std": 0.3113763928413391, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8412730693817139, "rewards/length2tails_reward/std": 0.22260740399360657, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08379036001861095, "epoch": 3.45, "frac_reward_zero_std": 0.0, "grad_norm": 0.07183808833360672, "learning_rate": 4.555402128836642e-07, "loss": -0.0059, "num_tokens": 15056117.0, "reward": 13.833101272583008, "reward_std": 0.7487102746963501, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8102487325668335, "rewards/length2tails_reward/std": 0.24392889440059662, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.3125, "completions/mean_terminated_length": 271.3125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.08676104340702295, "epoch": 3.452, "frac_reward_zero_std": 0.0, "grad_norm": 0.07923120260238647, "learning_rate": 4.544650987894514e-07, "loss": -0.0054, "num_tokens": 15064831.0, "reward": 13.915239334106445, "reward_std": 0.32995709776878357, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.782934844493866, "rewards/length2tails_reward/std": 0.2695567309856415, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 274.0625, "completions/mean_terminated_length": 274.0625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10370921809226274, "epoch": 3.454, "frac_reward_zero_std": 0.0, "grad_norm": 0.0856776013970375, "learning_rate": 4.533908816903115e-07, "loss": -0.0002, "num_tokens": 15073633.0, "reward": 13.965975761413574, "reward_std": 0.22475717961788177, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8914409279823303, "rewards/length2tails_reward/std": 0.1631007343530655, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.09375, "completions/mean_terminated_length": 272.09375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08512784168124199, "epoch": 3.456, "frac_reward_zero_std": 0.0, "grad_norm": 0.07340458035469055, "learning_rate": 4.5231756335252433e-07, "loss": -0.0011, "num_tokens": 15082372.0, "reward": 13.631868362426758, "reward_std": 1.0840603113174438, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7518198490142822, "rewards/length2tails_reward/std": 0.27484434843063354, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4963815212249756, "rewards/thermo_reward/std": 1.0421310663223267, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.71875, "completions/mean_terminated_length": 272.71875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07721906714141369, "epoch": 3.458, "frac_reward_zero_std": 0.0, "grad_norm": 0.20975400507450104, "learning_rate": 4.512451455408929e-07, "loss": -0.0056, "num_tokens": 15091131.0, "reward": 12.620357513427734, "reward_std": 3.4667251110076904, "rewards/fitness_reward/mean": 6.938035011291504, "rewards/fitness_reward/std": 1.840762972831726, "rewards/kidney_reward/mean": 2.36350679397583, "rewards/kidney_reward/std": 0.737473726272583, "rewards/length2tails_reward/mean": 0.7722162008285522, "rewards/length2tails_reward/std": 0.3076375722885132, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.141594171524048, "rewards/thermo_reward/std": 1.4435781240463257, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0899735875427723, "epoch": 3.46, "frac_reward_zero_std": 0.0, "grad_norm": 0.16601188480854034, "learning_rate": 4.5017363001873774e-07, "loss": 0.0053, "num_tokens": 15099868.0, "reward": 13.60742473602295, "reward_std": 0.6418152451515198, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7704048156738281, "rewards/length2tails_reward/std": 0.24701431393623352, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.470078945159912, "rewards/thermo_reward/std": 0.5830413699150085, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 275.71875, "completions/mean_terminated_length": 275.71875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.09080104809254408, "epoch": 3.462, "frac_reward_zero_std": 0.0, "grad_norm": 0.16605244576931, "learning_rate": 4.4910301854789755e-07, "loss": -0.0038, "num_tokens": 15108723.0, "reward": 13.880746841430664, "reward_std": 0.3769935965538025, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8368751406669617, "rewards/length2tails_reward/std": 0.21261143684387207, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.9375, "completions/mean_terminated_length": 272.9375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07802405953407288, "epoch": 3.464, "frac_reward_zero_std": 0.0, "grad_norm": 0.06647925823926926, "learning_rate": 4.480333128887237e-07, "loss": -0.0054, "num_tokens": 15117489.0, "reward": 13.134675979614258, "reward_std": 2.7280941009521484, "rewards/fitness_reward/mean": 7.047297477722168, "rewards/fitness_reward/std": 1.7756171226501465, "rewards/kidney_reward/mean": 2.478243827819824, "rewards/kidney_reward/std": 0.5618298053741455, "rewards/length2tails_reward/mean": 0.7867090702056885, "rewards/length2tails_reward/std": 0.27397066354751587, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.430464267730713, "rewards/thermo_reward/std": 0.9946144819259644, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.0766105898655951, "epoch": 3.466, "frac_reward_zero_std": 0.0, "grad_norm": 0.07114367187023163, "learning_rate": 4.4696451480007846e-07, "loss": 0.0035, "num_tokens": 15126232.0, "reward": 13.954242706298828, "reward_std": 0.22321708500385284, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7741064429283142, "rewards/length2tails_reward/std": 0.2730728089809418, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.8125, "completions/mean_terminated_length": 270.8125, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.09087709616869688, "epoch": 3.468, "frac_reward_zero_std": 0.0, "grad_norm": 0.11216463148593903, "learning_rate": 4.458966260393322e-07, "loss": -0.0007, "num_tokens": 15134930.0, "reward": 13.91307544708252, "reward_std": 0.3138239085674286, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7613010406494141, "rewards/length2tails_reward/std": 0.2676401138305664, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "entropy": 0.08705608453601599, "epoch": 3.4699999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.1364695131778717, "learning_rate": 4.448296483623587e-07, "loss": 0.0003, "num_tokens": 15143691.0, "reward": 13.93537425994873, "reward_std": 0.37957563996315, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8590216636657715, "rewards/length2tails_reward/std": 0.20525671541690826, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.5625, "completions/mean_terminated_length": 271.5625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.07921055890619755, "epoch": 3.472, "frac_reward_zero_std": 0.0, "grad_norm": 0.17655280232429504, "learning_rate": 4.4376358352353526e-07, "loss": -0.0037, "num_tokens": 15152413.0, "reward": 13.572860717773438, "reward_std": 1.2297868728637695, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7480850219726562, "rewards/length2tails_reward/std": 0.2410997599363327, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.495255708694458, "rewards/thermo_reward/std": 1.0479415655136108, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.40625, "completions/mean_terminated_length": 273.40625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08936100918799639, "epoch": 3.474, "frac_reward_zero_std": 0.0, "grad_norm": 0.09362028539180756, "learning_rate": 4.4269843327573743e-07, "loss": -0.0006, "num_tokens": 15161194.0, "reward": 12.85144329071045, "reward_std": 4.335888385772705, "rewards/fitness_reward/mean": 7.02105712890625, "rewards/fitness_reward/std": 1.924055814743042, "rewards/kidney_reward/mean": 2.4442200660705566, "rewards/kidney_reward/std": 0.8848810791969299, "rewards/length2tails_reward/mean": 0.8391945362091064, "rewards/length2tails_reward/std": 0.20115543901920319, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2022478580474854, "rewards/thermo_reward/std": 1.7934486865997314, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 283.625, "completions/mean_terminated_length": 283.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08751403260976076, "epoch": 3.476, "frac_reward_zero_std": 0.0, "grad_norm": 0.29367560148239136, "learning_rate": 4.416341993703373e-07, "loss": -0.0084, "num_tokens": 15170302.0, "reward": 13.098956108093262, "reward_std": 4.083829879760742, "rewards/fitness_reward/mean": 7.032780170440674, "rewards/fitness_reward/std": 1.8577386140823364, "rewards/kidney_reward/mean": 2.4100849628448486, "rewards/kidney_reward/std": 0.9298133254051208, "rewards/length2tails_reward/mean": 0.7794876098632812, "rewards/length2tails_reward/std": 0.2651849091053009, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4781417846679688, "rewards/thermo_reward/std": 1.340535044670105, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 271.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09130069054663181, "epoch": 3.4779999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.14665277302265167, "learning_rate": 4.405708835572005e-07, "loss": 0.001, "num_tokens": 15179025.0, "reward": 13.806692123413086, "reward_std": 0.5210264325141907, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.768774151802063, "rewards/length2tails_reward/std": 0.26261356472969055, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.53125, "completions/mean_terminated_length": 272.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08416412118822336, "epoch": 3.48, "frac_reward_zero_std": 0.0, "grad_norm": 0.1830289661884308, "learning_rate": 4.39508487584682e-07, "loss": 0.0001, "num_tokens": 15187778.0, "reward": 13.80591869354248, "reward_std": 0.5232962369918823, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7610487937927246, "rewards/length2tails_reward/std": 0.2874740958213806, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.4375, "completions/mean_terminated_length": 271.4375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.08090137969702482, "epoch": 3.482, "frac_reward_zero_std": 0.0, "grad_norm": 0.10863588005304337, "learning_rate": 4.384470131996252e-07, "loss": 0.0004, "num_tokens": 15196496.0, "reward": 13.718465805053711, "reward_std": 0.5391813516616821, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8095073699951172, "rewards/length2tails_reward/std": 0.2026718258857727, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498507022857666, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.53125, "completions/mean_terminated_length": 270.53125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.085418657399714, "epoch": 3.484, "frac_reward_zero_std": 0.0, "grad_norm": 0.1338280588388443, "learning_rate": 4.3738646214735864e-07, "loss": -0.0165, "num_tokens": 15205185.0, "reward": 13.87672233581543, "reward_std": 0.3799758553504944, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7966322898864746, "rewards/length2tails_reward/std": 0.20604509115219116, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.84375, "completions/mean_terminated_length": 272.84375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08169888611882925, "epoch": 3.4859999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.07724236696958542, "learning_rate": 4.363268361716912e-07, "loss": -0.0017, "num_tokens": 15213948.0, "reward": 13.66454792022705, "reward_std": 0.8659971356391907, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8111091256141663, "rewards/length2tails_reward/std": 0.21726176142692566, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.495771646499634, "rewards/thermo_reward/std": 0.8601759076118469, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08383139874786139, "epoch": 3.488, "frac_reward_zero_std": 0.0, "grad_norm": 0.11576797813177109, "learning_rate": 4.3526813701491183e-07, "loss": -0.0033, "num_tokens": 15222702.0, "reward": 13.91389274597168, "reward_std": 0.32040977478027344, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7694723606109619, "rewards/length2tails_reward/std": 0.28133609890937805, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 274.15625, "completions/mean_terminated_length": 274.15625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0834615072235465, "epoch": 3.49, "frac_reward_zero_std": 0.0, "grad_norm": 0.18630819022655487, "learning_rate": 4.3421036641778553e-07, "loss": 0.0013, "num_tokens": 15231507.0, "reward": 13.205314636230469, "reward_std": 2.668398380279541, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.3939995765686035, "rewards/kidney_reward/std": 0.9162217378616333, "rewards/length2tails_reward/mean": 0.8655220866203308, "rewards/length2tails_reward/std": 0.20133092999458313, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.321087598800659, "rewards/thermo_reward/std": 1.5478066205978394, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08705971017479897, "epoch": 3.492, "frac_reward_zero_std": 0.0, "grad_norm": 0.08833774924278259, "learning_rate": 4.3315352611955035e-07, "loss": 0.0012, "num_tokens": 15240231.0, "reward": 13.732002258300781, "reward_std": 1.0502101182937622, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7496770024299622, "rewards/length2tails_reward/std": 0.27922266721725464, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.596729278564453, "rewards/thermo_reward/std": 0.9044937491416931, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08123445883393288, "epoch": 3.4939999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.12158084660768509, "learning_rate": 4.320976178579141e-07, "loss": 0.0005, "num_tokens": 15248982.0, "reward": 13.316404342651367, "reward_std": 1.9671152830123901, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5067176818847656, "rewards/kidney_reward/std": 0.5403326153755188, "rewards/length2tails_reward/mean": 0.7520942687988281, "rewards/length2tails_reward/std": 0.2965957820415497, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.273292303085327, "rewards/thermo_reward/std": 1.465672254562378, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08086736872792244, "epoch": 3.496, "frac_reward_zero_std": 0.0, "grad_norm": 0.09447924047708511, "learning_rate": 4.310426433690528e-07, "loss": 0.0068, "num_tokens": 15257752.0, "reward": 13.811731338500977, "reward_std": 0.5163472890853882, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8191708326339722, "rewards/length2tails_reward/std": 0.21201351284980774, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.34375, "completions/mean_terminated_length": 273.34375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.07997060008347034, "epoch": 3.498, "frac_reward_zero_std": 0.0, "grad_norm": 0.09858963638544083, "learning_rate": 4.299886043876071e-07, "loss": -0.0017, "num_tokens": 15266531.0, "reward": 13.764122009277344, "reward_std": 0.50864577293396, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8671994209289551, "rewards/length2tails_reward/std": 0.12816838920116425, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 269.59375, "completions/mean_terminated_length": 269.59375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.08890233561396599, "epoch": 3.5, "frac_reward_zero_std": 0.0, "grad_norm": 0.16905415058135986, "learning_rate": 4.289355026466791e-07, "loss": 0.0071, "num_tokens": 15275190.0, "reward": 13.854276657104492, "reward_std": 0.4811854362487793, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8457736968994141, "rewards/length2tails_reward/std": 0.1967909038066864, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0906071225181222, "epoch": 3.502, "frac_reward_zero_std": 0.0, "grad_norm": 0.09274038672447205, "learning_rate": 4.278833398778305e-07, "loss": -0.006, "num_tokens": 15283938.0, "reward": 13.834604263305664, "reward_std": 0.4413672685623169, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7743013501167297, "rewards/length2tails_reward/std": 0.2758329510688782, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 282.21875, "completions/mean_terminated_length": 282.21875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08395624533295631, "epoch": 3.504, "frac_reward_zero_std": 0.0, "grad_norm": 0.4823765456676483, "learning_rate": 4.2683211781107785e-07, "loss": -0.0172, "num_tokens": 15293001.0, "reward": 13.870827674865723, "reward_std": 0.3858233094215393, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7376867532730103, "rewards/length2tails_reward/std": 0.2750190496444702, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.03125, "completions/mean_terminated_length": 271.03125, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.08187512308359146, "epoch": 3.5060000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.11266060173511505, "learning_rate": 4.257818381748921e-07, "loss": -0.0006, "num_tokens": 15301706.0, "reward": 13.875701904296875, "reward_std": 0.38500985503196716, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7864289283752441, "rewards/length2tails_reward/std": 0.23578976094722748, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.5625, "completions/mean_terminated_length": 270.5625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.08811250422149897, "epoch": 3.508, "frac_reward_zero_std": 0.0, "grad_norm": 0.08882928639650345, "learning_rate": 4.247325026961941e-07, "loss": 0.0026, "num_tokens": 15310396.0, "reward": 13.953454971313477, "reward_std": 0.22397710382938385, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7662337422370911, "rewards/length2tails_reward/std": 0.25318098068237305, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.08091798983514309, "epoch": 3.51, "frac_reward_zero_std": 0.0, "grad_norm": 0.07322859764099121, "learning_rate": 4.2368411310035237e-07, "loss": -0.004, "num_tokens": 15319112.0, "reward": 13.878302574157715, "reward_std": 0.3881266117095947, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8124330043792725, "rewards/length2tails_reward/std": 0.23422132432460785, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.0625, "completions/mean_terminated_length": 270.0625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.08312960062175989, "epoch": 3.512, "frac_reward_zero_std": 0.0, "grad_norm": 0.20515073835849762, "learning_rate": 4.226366711111807e-07, "loss": -0.0256, "num_tokens": 15327786.0, "reward": 13.440038681030273, "reward_std": 2.7264461517333984, "rewards/fitness_reward/mean": 7.043000221252441, "rewards/fitness_reward/std": 1.7999252080917358, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.849888026714325, "rewards/length2tails_reward/std": 0.1674598753452301, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.612929105758667, "rewards/thermo_reward/std": 0.8191527128219604, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.21875, "completions/mean_terminated_length": 273.21875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08522853534668684, "epoch": 3.5140000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.12777894735336304, "learning_rate": 4.2159017845093346e-07, "loss": 0.0014, "num_tokens": 15336561.0, "reward": 13.880290031433105, "reward_std": 0.3749031126499176, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8323098421096802, "rewards/length2tails_reward/std": 0.2247202843427658, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08242740109562874, "epoch": 3.516, "frac_reward_zero_std": 0.0, "grad_norm": 0.10702885687351227, "learning_rate": 4.205446368403052e-07, "loss": -0.0004, "num_tokens": 15345308.0, "reward": 13.673827171325684, "reward_std": 0.9926640391349792, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8025897741317749, "rewards/length2tails_reward/std": 0.2218964546918869, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5332627296447754, "rewards/thermo_reward/std": 0.8561917543411255, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 270.90625, "completions/mean_terminated_length": 270.90625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.10247748345136642, "epoch": 3.518, "frac_reward_zero_std": 0.0, "grad_norm": 0.5401629209518433, "learning_rate": 4.195000479984264e-07, "loss": 0.0074, "num_tokens": 15354009.0, "reward": 13.805831909179688, "reward_std": 0.5156649351119995, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7601784467697144, "rewards/length2tails_reward/std": 0.23915334045886993, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 282.84375, "completions/mean_terminated_length": 282.84375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08352225320413709, "epoch": 3.52, "frac_reward_zero_std": 0.0, "grad_norm": 0.10053406655788422, "learning_rate": 4.184564136428611e-07, "loss": -0.0128, "num_tokens": 15363092.0, "reward": 13.87208366394043, "reward_std": 0.375398188829422, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.750244677066803, "rewards/length2tails_reward/std": 0.27107128500938416, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09472567960619926, "epoch": 3.5220000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.13488849997520447, "learning_rate": 4.174137354896039e-07, "loss": -0.0018, "num_tokens": 15371844.0, "reward": 13.780675888061523, "reward_std": 0.5977986454963684, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7822105884552002, "rewards/length2tails_reward/std": 0.24083751440048218, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 274.125, "completions/mean_terminated_length": 274.125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.07826209580525756, "epoch": 3.524, "frac_reward_zero_std": 0.0, "grad_norm": 0.06917013227939606, "learning_rate": 4.163720152530765e-07, "loss": 0.0035, "num_tokens": 15380648.0, "reward": 13.924861907958984, "reward_std": 0.3114674389362335, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8791612386703491, "rewards/length2tails_reward/std": 0.19188782572746277, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.34375, "completions/mean_terminated_length": 273.34375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09946441370993853, "epoch": 3.526, "frac_reward_zero_std": 0.0, "grad_norm": 0.13247686624526978, "learning_rate": 4.153312546461264e-07, "loss": 0.0046, "num_tokens": 15389427.0, "reward": 13.841333389282227, "reward_std": 0.4241860806941986, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8416051864624023, "rewards/length2tails_reward/std": 0.1799822300672531, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.28125, "completions/mean_terminated_length": 273.28125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08188586542382836, "epoch": 3.528, "frac_reward_zero_std": 0.0, "grad_norm": 0.09976492822170258, "learning_rate": 4.142914553800232e-07, "loss": -0.0034, "num_tokens": 15398204.0, "reward": 13.013737678527832, "reward_std": 4.373779296875, "rewards/fitness_reward/mean": 7.029188632965088, "rewards/fitness_reward/std": 1.8780547380447388, "rewards/kidney_reward/mean": 2.406290054321289, "rewards/kidney_reward/std": 0.9507110118865967, "rewards/length2tails_reward/mean": 0.8287488222122192, "rewards/length2tails_reward/std": 0.2133997231721878, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.395383596420288, "rewards/thermo_reward/std": 1.5805878639221191, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.1875, "completions/mean_terminated_length": 272.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08668717928230762, "epoch": 3.5300000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.07370003312826157, "learning_rate": 4.132526191644549e-07, "loss": -0.0018, "num_tokens": 15406946.0, "reward": 13.925870895385742, "reward_std": 0.3816991448402405, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7639847993850708, "rewards/length2tails_reward/std": 0.2842487096786499, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 285.375, "completions/mean_terminated_length": 285.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09249016921967268, "epoch": 3.532, "frac_reward_zero_std": 0.0, "grad_norm": 0.2883523404598236, "learning_rate": 4.1221474770752696e-07, "loss": -0.0193, "num_tokens": 15416110.0, "reward": 13.796365737915039, "reward_std": 0.46921226382255554, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.790784478187561, "rewards/length2tails_reward/std": 0.24667830765247345, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08595772087574005, "epoch": 3.534, "frac_reward_zero_std": 0.0, "grad_norm": 0.07527884840965271, "learning_rate": 4.111778427157585e-07, "loss": -0.0068, "num_tokens": 15424857.0, "reward": 13.066018104553223, "reward_std": 3.571143865585327, "rewards/fitness_reward/mean": 6.99554443359375, "rewards/fitness_reward/std": 1.7628074884414673, "rewards/kidney_reward/mean": 2.4402666091918945, "rewards/kidney_reward/std": 0.7342117428779602, "rewards/length2tails_reward/mean": 0.7519161105155945, "rewards/length2tails_reward/std": 0.31728798151016235, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4550156593322754, "rewards/thermo_reward/std": 1.3517484664916992, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.9375, "completions/mean_terminated_length": 270.9375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.09108984749764204, "epoch": 3.536, "frac_reward_zero_std": 0.0, "grad_norm": 0.2417386919260025, "learning_rate": 4.101419058940786e-07, "loss": -0.0202, "num_tokens": 15433559.0, "reward": 13.393560409545898, "reward_std": 1.687468409538269, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.7939262986183167, "rewards/length2tails_reward/std": 0.22488641738891602, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3085813522338867, "rewards/thermo_reward/std": 1.4648016691207886, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.5625, "completions/mean_terminated_length": 273.5625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08521412871778011, "epoch": 3.5380000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.10136404633522034, "learning_rate": 4.0910693894582547e-07, "loss": -0.0022, "num_tokens": 15442345.0, "reward": 13.412859916687012, "reward_std": 1.929201364517212, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5046486854553223, "rewards/kidney_reward/std": 0.5515561699867249, "rewards/length2tails_reward/mean": 0.8514991998672485, "rewards/length2tails_reward/std": 0.2045324146747589, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3618760108947754, "rewards/thermo_reward/std": 1.4068026542663574, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.4375, "completions/mean_terminated_length": 271.4375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08393862191587687, "epoch": 3.54, "frac_reward_zero_std": 0.0, "grad_norm": 0.11011866480112076, "learning_rate": 4.0807294357274214e-07, "loss": 0.0059, "num_tokens": 15451063.0, "reward": 13.429059982299805, "reward_std": 2.498326301574707, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.493276596069336, "rewards/kidney_reward/std": 0.7535098791122437, "rewards/length2tails_reward/mean": 0.7319997549057007, "rewards/length2tails_reward/std": 0.26997649669647217, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.516416549682617, "rewards/thermo_reward/std": 1.1344144344329834, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08664077240973711, "epoch": 3.542, "frac_reward_zero_std": 0.0, "grad_norm": 0.11247962713241577, "learning_rate": 4.0703992147497426e-07, "loss": -0.0015, "num_tokens": 15459817.0, "reward": 13.916162490844727, "reward_std": 0.31563135981559753, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7921739816665649, "rewards/length2tails_reward/std": 0.2422455996274948, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.08142776181921363, "epoch": 3.544, "frac_reward_zero_std": 0.0, "grad_norm": 0.10102333128452301, "learning_rate": 4.060078743510671e-07, "loss": -0.0003, "num_tokens": 15468552.0, "reward": 13.715620040893555, "reward_std": 0.5393779873847961, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7810434699058533, "rewards/length2tails_reward/std": 0.19498248398303986, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498504638671875, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.03125, "completions/mean_terminated_length": 271.03125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08150176517665386, "epoch": 3.5460000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.08746400475502014, "learning_rate": 4.049768038979631e-07, "loss": -0.0048, "num_tokens": 15477257.0, "reward": 13.692277908325195, "reward_std": 1.6817798614501953, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5264596939086914, "rewards/kidney_reward/std": 0.5657978057861328, "rewards/length2tails_reward/mean": 0.7054793238639832, "rewards/length2tails_reward/std": 0.2941136062145233, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.634085178375244, "rewards/thermo_reward/std": 1.1029006242752075, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.9375, "completions/mean_terminated_length": 272.9375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09111022902652621, "epoch": 3.548, "frac_reward_zero_std": 0.0, "grad_norm": 0.12175245583057404, "learning_rate": 4.0394671181099783e-07, "loss": -0.0007, "num_tokens": 15486023.0, "reward": 13.690387725830078, "reward_std": 0.6123297214508057, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8023138046264648, "rewards/length2tails_reward/std": 0.26038527488708496, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498507022857666, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08034937083721161, "epoch": 3.55, "frac_reward_zero_std": 0.0, "grad_norm": 0.09713394939899445, "learning_rate": 4.029175997838995e-07, "loss": -0.0137, "num_tokens": 15494774.0, "reward": 13.99374771118164, "reward_std": 0.02433871291577816, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7703039050102234, "rewards/length2tails_reward/std": 0.24338841438293457, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.829052448272705, "rewards/thermo_reward/std": 0.0, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.09106822917237878, "epoch": 3.552, "frac_reward_zero_std": 0.0, "grad_norm": 0.08475534617900848, "learning_rate": 4.01889469508784e-07, "loss": -0.0007, "num_tokens": 15503503.0, "reward": 13.563141822814941, "reward_std": 1.5742127895355225, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.526200532913208, "rewards/kidney_reward/std": 0.567264199256897, "rewards/length2tails_reward/mean": 0.7878760099411011, "rewards/length2tails_reward/std": 0.2587636709213257, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.496969223022461, "rewards/thermo_reward/std": 1.0391004085540771, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.08386838296428323, "epoch": 3.5540000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.10282585024833679, "learning_rate": 4.008623226761534e-07, "loss": 0.0002, "num_tokens": 15512234.0, "reward": 13.76917839050293, "reward_std": 0.5571933388710022, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7925007343292236, "rewards/length2tails_reward/std": 0.2730444669723511, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 270.9375, "completions/mean_terminated_length": 270.9375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.0853539565578103, "epoch": 3.556, "frac_reward_zero_std": 0.0, "grad_norm": 0.10087975859642029, "learning_rate": 3.998361609748928e-07, "loss": -0.0026, "num_tokens": 15520936.0, "reward": 13.262292861938477, "reward_std": 2.375058889389038, "rewards/fitness_reward/mean": 7.052567481994629, "rewards/fitness_reward/std": 1.7458053827285767, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7606244683265686, "rewards/length2tails_reward/std": 0.27506399154663086, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4619014263153076, "rewards/thermo_reward/std": 1.0260679721832275, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.08862397447228432, "epoch": 3.558, "frac_reward_zero_std": 0.0, "grad_norm": 0.10122814774513245, "learning_rate": 3.988109860922666e-07, "loss": 0.0033, "num_tokens": 15529700.0, "reward": 13.856950759887695, "reward_std": 0.4788967967033386, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8725180625915527, "rewards/length2tails_reward/std": 0.21750660240650177, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 274.59375, "completions/mean_terminated_length": 274.59375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.08838807186111808, "epoch": 3.56, "frac_reward_zero_std": 0.0, "grad_norm": 0.3841480612754822, "learning_rate": 3.977867997139178e-07, "loss": -0.0002, "num_tokens": 15538519.0, "reward": 13.84387493133545, "reward_std": 0.4256982207298279, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8670163154602051, "rewards/length2tails_reward/std": 0.1517139971256256, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.08908396679908037, "epoch": 3.5620000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.1281057596206665, "learning_rate": 3.967636035238635e-07, "loss": 0.0055, "num_tokens": 15547255.0, "reward": 12.93602180480957, "reward_std": 5.5553297996521, "rewards/fitness_reward/mean": 6.948566436767578, "rewards/fitness_reward/std": 2.3341219425201416, "rewards/kidney_reward/mean": 2.361267328262329, "rewards/kidney_reward/std": 1.500267744064331, "rewards/length2tails_reward/mean": 0.8272075653076172, "rewards/length2tails_reward/std": 0.20161886513233185, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.443466901779175, "rewards/thermo_reward/std": 1.7438091039657593, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.15625, "completions/mean_terminated_length": 271.15625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "entropy": 0.08590043894946575, "epoch": 3.564, "frac_reward_zero_std": 0.0, "grad_norm": 0.12525677680969238, "learning_rate": 3.9574139920449267e-07, "loss": 0.0048, "num_tokens": 15555964.0, "reward": 13.8359375, "reward_std": 0.4279092848300934, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7876396179199219, "rewards/length2tails_reward/std": 0.22690580785274506, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 274.34375, "completions/mean_terminated_length": 274.34375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08884886186569929, "epoch": 3.566, "frac_reward_zero_std": 0.0, "grad_norm": 0.09117615222930908, "learning_rate": 3.947201884365639e-07, "loss": -0.0038, "num_tokens": 15564775.0, "reward": 13.65673828125, "reward_std": 1.285060167312622, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7954223155975342, "rewards/length2tails_reward/std": 0.26181408762931824, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.516890525817871, "rewards/thermo_reward/std": 1.1318851709365845, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.28125, "completions/mean_terminated_length": 270.28125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.08355098683387041, "epoch": 3.568, "frac_reward_zero_std": 0.0, "grad_norm": 0.09611167013645172, "learning_rate": 3.9369997289920085e-07, "loss": -0.0048, "num_tokens": 15573456.0, "reward": 13.460000991821289, "reward_std": 1.3074196577072144, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.814959704875946, "rewards/length2tails_reward/std": 0.24196474254131317, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3181991577148438, "rewards/thermo_reward/std": 1.271405816078186, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09087568428367376, "epoch": 3.57, "frac_reward_zero_std": 0.0, "grad_norm": 0.12429216504096985, "learning_rate": 3.926807542698922e-07, "loss": 0.003, "num_tokens": 15582211.0, "reward": 13.4776029586792, "reward_std": 0.94354647397995, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8276777863502502, "rewards/length2tails_reward/std": 0.20748263597488403, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.334529399871826, "rewards/thermo_reward/std": 0.9096359610557556, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0920605594292283, "epoch": 3.572, "frac_reward_zero_std": 0.0, "grad_norm": 0.10412466526031494, "learning_rate": 3.916625342244868e-07, "loss": 0.0016, "num_tokens": 15590939.0, "reward": 13.483139038085938, "reward_std": 1.349585771560669, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7391869425773621, "rewards/length2tails_reward/std": 0.28878656029701233, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4064245223999023, "rewards/thermo_reward/std": 0.9289711713790894, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 754.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 287.40625, "completions/mean_terminated_length": 272.3548278808594, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09477597661316395, "epoch": 3.574, "frac_reward_zero_std": 0.0, "grad_norm": 1.0864534378051758, "learning_rate": 3.9064531443719194e-07, "loss": -0.0165, "num_tokens": 15600168.0, "reward": 13.918493270874023, "reward_std": 0.31318578124046326, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8154876232147217, "rewards/length2tails_reward/std": 0.2113732397556305, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.3125, "completions/mean_terminated_length": 273.3125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08148432523012161, "epoch": 3.576, "frac_reward_zero_std": 0.0, "grad_norm": 0.0958944708108902, "learning_rate": 3.8962909658056944e-07, "loss": -0.0071, "num_tokens": 15608946.0, "reward": 13.712947845458984, "reward_std": 0.8171798586845398, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8052818775177002, "rewards/length2tails_reward/std": 0.2560892701148987, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.71875, "completions/mean_terminated_length": 273.71875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08467878680676222, "epoch": 3.578, "frac_reward_zero_std": 0.0, "grad_norm": 0.06395456194877625, "learning_rate": 3.886138823255348e-07, "loss": -0.0045, "num_tokens": 15617737.0, "reward": 13.70904541015625, "reward_std": 1.2468868494033813, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8645087480545044, "rewards/length2tails_reward/std": 0.17950421571731567, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5622894763946533, "rewards/thermo_reward/std": 1.0893287658691406, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.09077019430696964, "epoch": 3.58, "frac_reward_zero_std": 0.0, "grad_norm": 0.10219375044107437, "learning_rate": 3.8759967334135214e-07, "loss": 0.0043, "num_tokens": 15626485.0, "reward": 12.912355422973633, "reward_std": 4.981294631958008, "rewards/fitness_reward/mean": 6.975485801696777, "rewards/fitness_reward/std": 2.18184494972229, "rewards/kidney_reward/mean": 2.4065518379211426, "rewards/kidney_reward/std": 1.244099736213684, "rewards/length2tails_reward/mean": 0.788223147392273, "rewards/length2tails_reward/std": 0.2414843589067459, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3514962196350098, "rewards/thermo_reward/std": 1.6070235967636108, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 283.28125, "completions/mean_terminated_length": 283.28125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08667771378532052, "epoch": 3.582, "frac_reward_zero_std": 0.0, "grad_norm": 0.4496097266674042, "learning_rate": 3.865864712956336e-07, "loss": -0.0498, "num_tokens": 15635582.0, "reward": 13.995429992675781, "reward_std": 0.02127438597381115, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.787126898765564, "rewards/length2tails_reward/std": 0.21274513006210327, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.829052448272705, "rewards/thermo_reward/std": 0.0, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.75, "completions/mean_terminated_length": 273.75, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.08310918603092432, "epoch": 3.584, "frac_reward_zero_std": 0.0, "grad_norm": 0.09372947365045547, "learning_rate": 3.8557427785433536e-07, "loss": 0.0049, "num_tokens": 15644374.0, "reward": 13.748785018920898, "reward_std": 0.6206957101821899, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8621543645858765, "rewards/length2tails_reward/std": 0.21344423294067383, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08902006130665541, "epoch": 3.586, "frac_reward_zero_std": 0.0, "grad_norm": 0.10767589509487152, "learning_rate": 3.8456309468175527e-07, "loss": 0.0, "num_tokens": 15653117.0, "reward": 13.834826469421387, "reward_std": 0.4260368347167969, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7765315771102905, "rewards/length2tails_reward/std": 0.23316825926303864, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.08544992376118898, "epoch": 3.588, "frac_reward_zero_std": 0.0, "grad_norm": 0.2308015078306198, "learning_rate": 3.835529234405303e-07, "loss": -0.0079, "num_tokens": 15661841.0, "reward": 13.996078491210938, "reward_std": 0.022238049656152725, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7936158180236816, "rewards/length2tails_reward/std": 0.22238068282604218, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.829052448272705, "rewards/thermo_reward/std": 0.0, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08660221099853516, "epoch": 3.59, "frac_reward_zero_std": 0.0, "grad_norm": 0.14856624603271484, "learning_rate": 3.825437657916325e-07, "loss": 0.001, "num_tokens": 15670593.0, "reward": 13.877517700195312, "reward_std": 0.37583112716674805, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8045819401741028, "rewards/length2tails_reward/std": 0.22768016159534454, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09035045560449362, "epoch": 3.592, "frac_reward_zero_std": 0.0, "grad_norm": 0.10779079794883728, "learning_rate": 3.815356233943685e-07, "loss": -0.0057, "num_tokens": 15679349.0, "reward": 13.757761001586914, "reward_std": 0.5190196633338928, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8035960793495178, "rewards/length2tails_reward/std": 0.27802371978759766, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 754.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 285.75, "completions/mean_terminated_length": 270.6451416015625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.10499814618378878, "epoch": 3.594, "frac_reward_zero_std": 0.0, "grad_norm": 1.3619617223739624, "learning_rate": 3.805284979063752e-07, "loss": -0.0203, "num_tokens": 15688525.0, "reward": 13.837220191955566, "reward_std": 0.42661750316619873, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8004651665687561, "rewards/length2tails_reward/std": 0.24887455999851227, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.0625, "completions/mean_terminated_length": 271.0625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.08877876028418541, "epoch": 3.596, "frac_reward_zero_std": 0.0, "grad_norm": 0.09175974875688553, "learning_rate": 3.7952239098361726e-07, "loss": 0.0021, "num_tokens": 15697231.0, "reward": 13.38138198852539, "reward_std": 1.7214365005493164, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5350708961486816, "rewards/kidney_reward/std": 0.5170859098434448, "rewards/length2tails_reward/mean": 0.8196229934692383, "rewards/length2tails_reward/std": 0.20097720623016357, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3031632900238037, "rewards/thermo_reward/std": 1.2440290451049805, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08055632095783949, "epoch": 3.598, "frac_reward_zero_std": 0.0, "grad_norm": 0.09719519317150116, "learning_rate": 3.7851730428038473e-07, "loss": -0.0062, "num_tokens": 15705995.0, "reward": 13.470977783203125, "reward_std": 1.5660966634750366, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8105138540267944, "rewards/length2tails_reward/std": 0.26108792424201965, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.356980562210083, "rewards/thermo_reward/std": 1.4137026071548462, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.84375, "completions/mean_terminated_length": 269.84375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.08724918495863676, "epoch": 3.6, "frac_reward_zero_std": 0.0, "grad_norm": 0.288025826215744, "learning_rate": 3.7751323944929057e-07, "loss": 0.008, "num_tokens": 15714662.0, "reward": 13.290872573852539, "reward_std": 2.4588677883148193, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.464883327484131, "rewards/kidney_reward/std": 0.7700634002685547, "rewards/length2tails_reward/mean": 0.8100026249885559, "rewards/length2tails_reward/std": 0.21141447126865387, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3413124084472656, "rewards/thermo_reward/std": 1.4779037237167358, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.03125, "completions/mean_terminated_length": 273.03125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08742036391049623, "epoch": 3.602, "frac_reward_zero_std": 0.0, "grad_norm": 0.21453461050987244, "learning_rate": 3.765101981412665e-07, "loss": 0.0058, "num_tokens": 15723431.0, "reward": 13.496167182922363, "reward_std": 1.5324259996414185, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5327911376953125, "rewards/kidney_reward/std": 0.5299828052520752, "rewards/length2tails_reward/mean": 0.828900933265686, "rewards/length2tails_reward/std": 0.20199020206928253, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4193015098571777, "rewards/thermo_reward/std": 1.0495328903198242, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.875, "completions/mean_terminated_length": 273.875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08902319148182869, "epoch": 3.604, "frac_reward_zero_std": 0.0, "grad_norm": 0.11332407593727112, "learning_rate": 3.755081820055621e-07, "loss": 0.0011, "num_tokens": 15732227.0, "reward": 13.84552001953125, "reward_std": 0.42908626794815063, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8834710121154785, "rewards/length2tails_reward/std": 0.16791842877864838, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.5625, "completions/mean_terminated_length": 270.5625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.08871565293520689, "epoch": 3.606, "frac_reward_zero_std": 0.0, "grad_norm": 0.07865909487009048, "learning_rate": 3.7450719268974127e-07, "loss": -0.0011, "num_tokens": 15740917.0, "reward": 12.999435424804688, "reward_std": 4.383616924285889, "rewards/fitness_reward/mean": 7.023216724395752, "rewards/fitness_reward/std": 1.911836862564087, "rewards/kidney_reward/mean": 2.4475834369659424, "rewards/kidney_reward/std": 1.011989712715149, "rewards/length2tails_reward/mean": 0.8283498287200928, "rewards/length2tails_reward/std": 0.18109019100666046, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.345799446105957, "rewards/thermo_reward/std": 1.6405893564224243, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.78125, "completions/mean_terminated_length": 273.78125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08313896227627993, "epoch": 3.608, "frac_reward_zero_std": 0.0, "grad_norm": 0.13741354644298553, "learning_rate": 3.7350723183967935e-07, "loss": -0.001, "num_tokens": 15749710.0, "reward": 13.44559097290039, "reward_std": 1.4326897859573364, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8529207110404968, "rewards/length2tails_reward/std": 0.19226661324501038, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3273532390594482, "rewards/thermo_reward/std": 1.2326303720474243, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08045259164646268, "epoch": 3.61, "frac_reward_zero_std": 0.0, "grad_norm": 0.10184618085622787, "learning_rate": 3.725083010995611e-07, "loss": -0.0012, "num_tokens": 15758454.0, "reward": 13.835386276245117, "reward_std": 0.43626150488853455, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7821381688117981, "rewards/length2tails_reward/std": 0.2621596157550812, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08251079404726624, "epoch": 3.612, "frac_reward_zero_std": 0.0, "grad_norm": 0.12113630771636963, "learning_rate": 3.715104021118763e-07, "loss": 0.0001, "num_tokens": 15767209.0, "reward": 13.416242599487305, "reward_std": 2.024634599685669, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.463491201400757, "rewards/kidney_reward/std": 0.7777802348136902, "rewards/length2tails_reward/mean": 0.7820241451263428, "rewards/length2tails_reward/std": 0.26788899302482605, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4133644104003906, "rewards/thermo_reward/std": 1.27640962600708, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 274.21875, "completions/mean_terminated_length": 274.21875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08606186509132385, "epoch": 3.614, "frac_reward_zero_std": 0.0, "grad_norm": 0.15508778393268585, "learning_rate": 3.705135365174197e-07, "loss": 0.0021, "num_tokens": 15776016.0, "reward": 13.849562644958496, "reward_std": 0.4784981906414032, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7986260652542114, "rewards/length2tails_reward/std": 0.26385998725891113, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0820772061124444, "epoch": 3.616, "frac_reward_zero_std": 0.0, "grad_norm": 0.14850305020809174, "learning_rate": 3.6951770595528606e-07, "loss": 0.0002, "num_tokens": 15784772.0, "reward": 12.774524688720703, "reward_std": 4.888601303100586, "rewards/fitness_reward/mean": 6.999087333679199, "rewards/fitness_reward/std": 2.048335552215576, "rewards/kidney_reward/mean": 2.341603994369507, "rewards/kidney_reward/std": 1.1970475912094116, "rewards/length2tails_reward/mean": 0.8276439905166626, "rewards/length2tails_reward/std": 0.22841252386569977, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2510693073272705, "rewards/thermo_reward/std": 1.8503024578094482, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 272.84375, "completions/mean_terminated_length": 272.84375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08443187456578016, "epoch": 3.618, "frac_reward_zero_std": 0.0, "grad_norm": 0.1716132014989853, "learning_rate": 3.685229120628688e-07, "loss": 0.0012, "num_tokens": 15793535.0, "reward": 13.70191478729248, "reward_std": 0.8540881872177124, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7989389300346375, "rewards/length2tails_reward/std": 0.24277861416339874, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.534355878829956, "rewards/thermo_reward/std": 0.8508411049842834, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 270.65625, "completions/mean_terminated_length": 270.65625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.09030141588300467, "epoch": 3.62, "frac_reward_zero_std": 0.0, "grad_norm": 0.16818749904632568, "learning_rate": 3.6752915647585646e-07, "loss": 0.0007, "num_tokens": 15802228.0, "reward": 13.565706253051758, "reward_std": 1.2407891750335693, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7469499111175537, "rewards/length2tails_reward/std": 0.29119476675987244, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4307057857513428, "rewards/thermo_reward/std": 1.130438208580017, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07465990725904703, "epoch": 3.622, "frac_reward_zero_std": 0.0, "grad_norm": 0.08960974216461182, "learning_rate": 3.665364408282304e-07, "loss": 0.0034, "num_tokens": 15810998.0, "reward": 13.920299530029297, "reward_std": 0.31105008721351624, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8335549831390381, "rewards/length2tails_reward/std": 0.20239099860191345, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0867516491562128, "epoch": 3.624, "frac_reward_zero_std": 0.0, "grad_norm": 0.07420986890792847, "learning_rate": 3.6554476675226156e-07, "loss": 0.0002, "num_tokens": 15819763.0, "reward": 13.589237213134766, "reward_std": 1.4417500495910645, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8086227774620056, "rewards/length2tails_reward/std": 0.24868273735046387, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5055789947509766, "rewards/thermo_reward/std": 0.9949191212654114, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.21875, "completions/mean_terminated_length": 271.21875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08404396940022707, "epoch": 3.626, "frac_reward_zero_std": 0.0, "grad_norm": 0.11983322352170944, "learning_rate": 3.6455413587850926e-07, "loss": -0.003, "num_tokens": 15828474.0, "reward": 13.836545944213867, "reward_std": 0.8850547075271606, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7509810924530029, "rewards/length2tails_reward/std": 0.19866718351840973, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.673783779144287, "rewards/thermo_reward/std": 0.8783318996429443, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.09375, "completions/mean_terminated_length": 273.09375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07894755946472287, "epoch": 3.628, "frac_reward_zero_std": 0.0, "grad_norm": 0.14280594885349274, "learning_rate": 3.6356454983581695e-07, "loss": 0.0019, "num_tokens": 15837245.0, "reward": 13.78122329711914, "reward_std": 0.5930385589599609, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.787685215473175, "rewards/length2tails_reward/std": 0.26022645831108093, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.34375, "completions/mean_terminated_length": 273.34375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0844585420563817, "epoch": 3.63, "frac_reward_zero_std": 0.0, "grad_norm": 0.09027701616287231, "learning_rate": 3.625760102513102e-07, "loss": -0.0016, "num_tokens": 15846024.0, "reward": 13.760713577270508, "reward_std": 0.5119152069091797, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8331196904182434, "rewards/length2tails_reward/std": 0.20415940880775452, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061467885971069, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.1875, "completions/mean_terminated_length": 273.1875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08118011150509119, "epoch": 3.632, "frac_reward_zero_std": 0.0, "grad_norm": 0.08081858605146408, "learning_rate": 3.6158851875039456e-07, "loss": -0.003, "num_tokens": 15854798.0, "reward": 13.591593742370605, "reward_std": 1.0339016914367676, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7975776195526123, "rewards/length2tails_reward/std": 0.2879304885864258, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4515304565429688, "rewards/thermo_reward/std": 0.8939418792724609, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0785574521869421, "epoch": 3.634, "frac_reward_zero_std": 0.0, "grad_norm": 0.16642813384532928, "learning_rate": 3.606020769567507e-07, "loss": 0.0061, "num_tokens": 15863570.0, "reward": 13.730648040771484, "reward_std": 0.5763334035873413, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8060550093650818, "rewards/length2tails_reward/std": 0.2286890745162964, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08207452390342951, "epoch": 3.636, "frac_reward_zero_std": 0.0, "grad_norm": 0.1245543584227562, "learning_rate": 3.596166864923348e-07, "loss": -0.0038, "num_tokens": 15872301.0, "reward": 13.270090103149414, "reward_std": 3.044645071029663, "rewards/fitness_reward/mean": 7.052626132965088, "rewards/fitness_reward/std": 1.7454723119735718, "rewards/kidney_reward/mean": 2.5107221603393555, "rewards/kidney_reward/std": 0.5186686515808105, "rewards/length2tails_reward/mean": 0.7430772185325623, "rewards/length2tails_reward/std": 0.29433321952819824, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.53243350982666, "rewards/thermo_reward/std": 0.8602582216262817, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.085698701441288, "epoch": 3.638, "frac_reward_zero_std": 0.0, "grad_norm": 0.3150479793548584, "learning_rate": 3.586323489773739e-07, "loss": -0.001, "num_tokens": 15881046.0, "reward": 13.878179550170898, "reward_std": 0.3804786503314972, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8112087845802307, "rewards/length2tails_reward/std": 0.21879929304122925, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.21875, "completions/mean_terminated_length": 273.21875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09197020716965199, "epoch": 3.64, "frac_reward_zero_std": 0.0, "grad_norm": 0.16853182017803192, "learning_rate": 3.576490660303636e-07, "loss": -0.0029, "num_tokens": 15889821.0, "reward": 13.605411529541016, "reward_std": 1.368744969367981, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.825190544128418, "rewards/length2tails_reward/std": 0.22926637530326843, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4625871181488037, "rewards/thermo_reward/std": 1.2189743518829346, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08115828037261963, "epoch": 3.642, "frac_reward_zero_std": 0.0, "grad_norm": 0.08074554055929184, "learning_rate": 3.566668392680662e-07, "loss": -0.0062, "num_tokens": 15898582.0, "reward": 13.297192573547363, "reward_std": 2.0624260902404785, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8180257678031921, "rewards/length2tails_reward/std": 0.22408005595207214, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1550843715667725, "rewards/thermo_reward/std": 1.9389499425888062, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0853598746471107, "epoch": 3.644, "frac_reward_zero_std": 0.0, "grad_norm": 0.08734510093927383, "learning_rate": 3.5568567030550577e-07, "loss": -0.0006, "num_tokens": 15907306.0, "reward": 13.05596923828125, "reward_std": 2.8374533653259277, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4475936889648438, "rewards/kidney_reward/std": 0.8660976886749268, "rewards/length2tails_reward/mean": 0.7015992403030396, "rewards/length2tails_reward/std": 0.31296905875205994, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1345396041870117, "rewards/thermo_reward/std": 1.8032125234603882, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08595753367990255, "epoch": 3.646, "frac_reward_zero_std": 0.0, "grad_norm": 0.10818291455507278, "learning_rate": 3.547055607559688e-07, "loss": 0.0033, "num_tokens": 15916050.0, "reward": 13.50337028503418, "reward_std": 1.7344375848770142, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.530332565307617, "rewards/kidney_reward/std": 0.5438900589942932, "rewards/length2tails_reward/mean": 0.7964829802513123, "rewards/length2tails_reward/std": 0.2236173003911972, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.432204484939575, "rewards/thermo_reward/std": 1.280646800994873, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.65625, "completions/mean_terminated_length": 272.65625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08719247533008456, "epoch": 3.648, "frac_reward_zero_std": 0.0, "grad_norm": 0.43279126286506653, "learning_rate": 3.5372651223099915e-07, "loss": -0.002, "num_tokens": 15924807.0, "reward": 12.916872024536133, "reward_std": 4.81889533996582, "rewards/fitness_reward/mean": 6.978363990783691, "rewards/fitness_reward/std": 2.165562629699707, "rewards/kidney_reward/mean": 2.306910991668701, "rewards/kidney_reward/std": 1.3264926671981812, "rewards/length2tails_reward/mean": 0.7791447043418884, "rewards/length2tails_reward/std": 0.2675744891166687, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4536831378936768, "rewards/thermo_reward/std": 1.473813772201538, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08192221820354462, "epoch": 3.65, "frac_reward_zero_std": 0.0, "grad_norm": 0.09442069381475449, "learning_rate": 3.52748526340396e-07, "loss": 0.0041, "num_tokens": 15933565.0, "reward": 13.876182556152344, "reward_std": 0.3726004660129547, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7912297248840332, "rewards/length2tails_reward/std": 0.27527180314064026, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.25, "completions/mean_terminated_length": 273.25, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08800566708669066, "epoch": 3.652, "frac_reward_zero_std": 0.0, "grad_norm": 0.15255124866962433, "learning_rate": 3.5177160469221176e-07, "loss": 0.0039, "num_tokens": 15942341.0, "reward": 13.62850284576416, "reward_std": 1.0358679294586182, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8195139169692993, "rewards/length2tails_reward/std": 0.21823741495609283, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4862453937530518, "rewards/thermo_reward/std": 0.9058089852333069, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.4375, "completions/mean_terminated_length": 271.4375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "entropy": 0.09326956886798143, "epoch": 3.654, "frac_reward_zero_std": 0.0, "grad_norm": 0.14029085636138916, "learning_rate": 3.50795748892748e-07, "loss": -0.0016, "num_tokens": 15951059.0, "reward": 13.559985160827637, "reward_std": 1.790492296218872, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5264534950256348, "rewards/kidney_reward/std": 0.5658333897590637, "rewards/length2tails_reward/mean": 0.7611898183822632, "rewards/length2tails_reward/std": 0.2647976577281952, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.496227741241455, "rewards/thermo_reward/std": 1.2426968812942505, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.28125, "completions/mean_terminated_length": 271.28125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.08237532759085298, "epoch": 3.656, "frac_reward_zero_std": 0.0, "grad_norm": 0.08693405240774155, "learning_rate": 3.4982096054655477e-07, "loss": -0.0036, "num_tokens": 15959772.0, "reward": 13.295555114746094, "reward_std": 3.090214252471924, "rewards/fitness_reward/mean": 6.941235542297363, "rewards/fitness_reward/std": 2.0655312538146973, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7664594650268555, "rewards/length2tails_reward/std": 0.3026435375213623, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.605912208557129, "rewards/thermo_reward/std": 0.8559578061103821, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08308725710958242, "epoch": 3.658, "frac_reward_zero_std": 0.0, "grad_norm": 0.23046323657035828, "learning_rate": 3.488472412564264e-07, "loss": 0.0061, "num_tokens": 15968510.0, "reward": 13.662422180175781, "reward_std": 1.0406625270843506, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7909877300262451, "rewards/length2tails_reward/std": 0.2367888242006302, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5230178833007812, "rewards/thermo_reward/std": 0.9068526029586792, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07731601735576987, "epoch": 3.66, "frac_reward_zero_std": 0.0, "grad_norm": 0.24017341434955597, "learning_rate": 3.478745926233998e-07, "loss": -0.0039, "num_tokens": 15977230.0, "reward": 13.384349822998047, "reward_std": 3.007572889328003, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.5390896797180176, "rewards/kidney_reward/std": 0.49435171484947205, "rewards/length2tails_reward/mean": 0.7631849050521851, "rewards/length2tails_reward/std": 0.2363831102848053, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6158881187438965, "rewards/thermo_reward/std": 0.8037141561508179, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 271.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08287757448852062, "epoch": 3.662, "frac_reward_zero_std": 0.0, "grad_norm": 0.07217924296855927, "learning_rate": 3.4690301624675123e-07, "loss": -0.0047, "num_tokens": 15985953.0, "reward": 13.908952713012695, "reward_std": 0.31878843903541565, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.720072865486145, "rewards/length2tails_reward/std": 0.30547112226486206, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07862121984362602, "epoch": 3.664, "frac_reward_zero_std": 0.0, "grad_norm": 0.09833700209856033, "learning_rate": 3.4593251372399414e-07, "loss": 0.0056, "num_tokens": 15994677.0, "reward": 13.86894416809082, "reward_std": 0.3705652952194214, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7188472151756287, "rewards/length2tails_reward/std": 0.26971179246902466, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.84375, "completions/mean_terminated_length": 272.84375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07587908860296011, "epoch": 3.666, "frac_reward_zero_std": 0.0, "grad_norm": 0.09324633330106735, "learning_rate": 3.449630866508757e-07, "loss": -0.0022, "num_tokens": 16003440.0, "reward": 13.91788101196289, "reward_std": 0.3153970539569855, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.809357762336731, "rewards/length2tails_reward/std": 0.21493566036224365, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 271.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07948243711143732, "epoch": 3.668, "frac_reward_zero_std": 0.0, "grad_norm": 0.12330475449562073, "learning_rate": 3.4399473662137514e-07, "loss": -0.0001, "num_tokens": 16012163.0, "reward": 13.845161437988281, "reward_std": 0.4772983491420746, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7546184062957764, "rewards/length2tails_reward/std": 0.2744663953781128, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 271.59375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "entropy": 0.08797703590244055, "epoch": 3.67, "frac_reward_zero_std": 0.0, "grad_norm": 0.1077851727604866, "learning_rate": 3.4302746522770076e-07, "loss": 0.0025, "num_tokens": 16020886.0, "reward": 13.835163116455078, "reward_std": 0.42623722553253174, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7799012064933777, "rewards/length2tails_reward/std": 0.2840479612350464, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 274.96875, "completions/mean_terminated_length": 274.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08596474630758166, "epoch": 3.672, "frac_reward_zero_std": 0.0, "grad_norm": 0.1660880595445633, "learning_rate": 3.420612740602874e-07, "loss": -0.0045, "num_tokens": 16029717.0, "reward": 13.80670166015625, "reward_std": 0.5210345983505249, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7688755989074707, "rewards/length2tails_reward/std": 0.2551930248737335, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.08587139938026667, "epoch": 3.674, "frac_reward_zero_std": 0.0, "grad_norm": 0.12363027781248093, "learning_rate": 3.410961647077939e-07, "loss": 0.0009, "num_tokens": 16038465.0, "reward": 13.799466133117676, "reward_std": 0.47291746735572815, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.821791410446167, "rewards/length2tails_reward/std": 0.20731790363788605, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07770541356876493, "epoch": 3.676, "frac_reward_zero_std": 0.0, "grad_norm": 0.06414253264665604, "learning_rate": 3.401321387571001e-07, "loss": -0.0052, "num_tokens": 16047225.0, "reward": 13.58360481262207, "reward_std": 2.1439197063446045, "rewards/fitness_reward/mean": 7.052915573120117, "rewards/fitness_reward/std": 1.7438353300094604, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.822883129119873, "rewards/length2tails_reward/std": 0.2441834658384323, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0891623767092824, "epoch": 3.678, "frac_reward_zero_std": 0.0, "grad_norm": 0.1793835461139679, "learning_rate": 3.39169197793304e-07, "loss": -0.0042, "num_tokens": 16055983.0, "reward": 13.732637405395508, "reward_std": 0.5838374495506287, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8259539604187012, "rewards/length2tails_reward/std": 0.24140463769435883, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.03125, "completions/mean_terminated_length": 273.03125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.09458326548337936, "epoch": 3.68, "frac_reward_zero_std": 0.0, "grad_norm": 0.1907602846622467, "learning_rate": 3.3820734339972036e-07, "loss": 0.0045, "num_tokens": 16064752.0, "reward": 13.851795196533203, "reward_std": 0.4789268374443054, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8209490776062012, "rewards/length2tails_reward/std": 0.2316625416278839, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 274.4375, "completions/mean_terminated_length": 274.4375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0811676699668169, "epoch": 3.682, "frac_reward_zero_std": 0.0, "grad_norm": 0.0921485498547554, "learning_rate": 3.372465771578771e-07, "loss": 0.0031, "num_tokens": 16073566.0, "reward": 13.846429824829102, "reward_std": 0.42756423354148865, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8925660252571106, "rewards/length2tails_reward/std": 0.1536208838224411, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.3125, "completions/mean_terminated_length": 272.3125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08341225422918797, "epoch": 3.684, "frac_reward_zero_std": 0.0, "grad_norm": 0.10851159691810608, "learning_rate": 3.362869006475126e-07, "loss": 0.0031, "num_tokens": 16082312.0, "reward": 13.754069328308105, "reward_std": 0.5051019787788391, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7666773796081543, "rewards/length2tails_reward/std": 0.303233802318573, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08298153709620237, "epoch": 3.686, "frac_reward_zero_std": 0.0, "grad_norm": 0.13726581633090973, "learning_rate": 3.3532831544657456e-07, "loss": 0.0009, "num_tokens": 16091041.0, "reward": 13.767756462097168, "reward_std": 0.5509318113327026, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.778285562992096, "rewards/length2tails_reward/std": 0.24406449496746063, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.21875, "completions/mean_terminated_length": 273.21875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0828853053972125, "epoch": 3.6879999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.070669025182724, "learning_rate": 3.3437082313121447e-07, "loss": 0.0032, "num_tokens": 16099816.0, "reward": 13.959261894226074, "reward_std": 0.22338415682315826, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8243112564086914, "rewards/length2tails_reward/std": 0.20171445608139038, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.088692725636065, "epoch": 3.69, "frac_reward_zero_std": 0.0, "grad_norm": 0.06675875186920166, "learning_rate": 3.3341442527578835e-07, "loss": -0.0004, "num_tokens": 16108572.0, "reward": 13.817087173461914, "reward_std": 0.8259702324867249, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8217202425003052, "rewards/length2tails_reward/std": 0.2654772996902466, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6472504138946533, "rewards/thermo_reward/std": 0.8268661499023438, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.90625, "completions/mean_terminated_length": 273.90625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08372374624013901, "epoch": 3.692, "frac_reward_zero_std": 0.0, "grad_norm": 0.08932638168334961, "learning_rate": 3.3245912345285197e-07, "loss": -0.0023, "num_tokens": 16117369.0, "reward": 13.630112648010254, "reward_std": 1.149518370628357, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8636769652366638, "rewards/length2tails_reward/std": 0.16939999163150787, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.510798454284668, "rewards/thermo_reward/std": 0.9683473706245422, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "entropy": 0.08410823810845613, "epoch": 3.694, "frac_reward_zero_std": 0.0, "grad_norm": 0.17860926687717438, "learning_rate": 3.315049192331595e-07, "loss": -0.0142, "num_tokens": 16126124.0, "reward": 13.166458129882812, "reward_std": 2.05751371383667, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8351097702980042, "rewards/length2tails_reward/std": 0.23980149626731873, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.050001621246338, "rewards/thermo_reward/std": 1.8831400871276855, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.1875, "completions/mean_terminated_length": 273.1875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08229447342455387, "epoch": 3.6959999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.09073762595653534, "learning_rate": 3.3055181418566e-07, "loss": -0.0009, "num_tokens": 16134898.0, "reward": 13.835367202758789, "reward_std": 0.4329942762851715, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7819371223449707, "rewards/length2tails_reward/std": 0.26615023612976074, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 273.53125, "completions/mean_terminated_length": 273.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08659685775637627, "epoch": 3.698, "frac_reward_zero_std": 0.0, "grad_norm": 0.47516101598739624, "learning_rate": 3.2959980987749483e-07, "loss": 0.0124, "num_tokens": 16143683.0, "reward": 13.798820495605469, "reward_std": 0.46693822741508484, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8153313994407654, "rewards/length2tails_reward/std": 0.24201750755310059, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07372273737564683, "epoch": 3.7, "frac_reward_zero_std": 0.0, "grad_norm": 0.049018096178770065, "learning_rate": 3.28648907873996e-07, "loss": -0.0049, "num_tokens": 16152418.0, "reward": 13.620929718017578, "reward_std": 1.8910012245178223, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5256857872009277, "rewards/kidney_reward/std": 0.5701754689216614, "rewards/length2tails_reward/mean": 0.7492478489875793, "rewards/length2tails_reward/std": 0.271221786737442, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.559133529663086, "rewards/thermo_reward/std": 1.3134887218475342, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07341090962290764, "epoch": 3.702, "frac_reward_zero_std": 0.0, "grad_norm": 0.10206151008605957, "learning_rate": 3.276991097386831e-07, "loss": -0.0069, "num_tokens": 16161179.0, "reward": 13.129651069641113, "reward_std": 2.6763594150543213, "rewards/fitness_reward/mean": 6.988076210021973, "rewards/fitness_reward/std": 2.110623359680176, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.767514169216156, "rewards/length2tails_reward/std": 0.30354535579681396, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.393061637878418, "rewards/thermo_reward/std": 0.99179607629776, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07788072619587183, "epoch": 3.7039999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.10819029062986374, "learning_rate": 3.2675041703326046e-07, "loss": -0.0037, "num_tokens": 16169927.0, "reward": 13.877944946289062, "reward_std": 0.38620296120643616, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8088620901107788, "rewards/length2tails_reward/std": 0.21891973912715912, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.1875, "completions/mean_terminated_length": 271.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.07938122935593128, "epoch": 3.706, "frac_reward_zero_std": 0.0, "grad_norm": 0.07176394760608673, "learning_rate": 3.258028313176151e-07, "loss": -0.0005, "num_tokens": 16178637.0, "reward": 13.868053436279297, "reward_std": 0.3806667625904083, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7099429368972778, "rewards/length2tails_reward/std": 0.28471970558166504, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.08443743642419577, "epoch": 3.708, "frac_reward_zero_std": 0.0, "grad_norm": 0.11585229635238647, "learning_rate": 3.2485635414981315e-07, "loss": -0.0001, "num_tokens": 16187405.0, "reward": 13.549070358276367, "reward_std": 1.1941407918930054, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8482940196990967, "rewards/length2tails_reward/std": 0.22004073858261108, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4312944412231445, "rewards/thermo_reward/std": 1.127699613571167, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.46875, "completions/mean_terminated_length": 271.46875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08564404863864183, "epoch": 3.71, "frac_reward_zero_std": 0.0, "grad_norm": 0.12348084151744843, "learning_rate": 3.2391098708609897e-07, "loss": -0.001, "num_tokens": 16196124.0, "reward": 12.726795196533203, "reward_std": 4.499066352844238, "rewards/fitness_reward/mean": 7.01171875, "rewards/fitness_reward/std": 1.9768810272216797, "rewards/kidney_reward/mean": 2.339609146118164, "rewards/kidney_reward/std": 1.1481411457061768, "rewards/length2tails_reward/mean": 0.7159409523010254, "rewards/length2tails_reward/std": 0.32566535472869873, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.203874111175537, "rewards/thermo_reward/std": 1.7565038204193115, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.09375, "completions/mean_terminated_length": 271.09375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0838960474357009, "epoch": 3.7119999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.11009681224822998, "learning_rate": 3.229667316808907e-07, "loss": -0.0031, "num_tokens": 16204831.0, "reward": 13.504732131958008, "reward_std": 2.51141619682312, "rewards/fitness_reward/mean": 6.986984729766846, "rewards/fitness_reward/std": 2.116797685623169, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.6934654116630554, "rewards/length2tails_reward/std": 0.2892797291278839, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09317321423441172, "epoch": 3.714, "frac_reward_zero_std": 0.0, "grad_norm": 0.10668087750673294, "learning_rate": 3.220235894867793e-07, "loss": 0.0006, "num_tokens": 16213575.0, "reward": 13.71420955657959, "reward_std": 0.535295844078064, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7669464349746704, "rewards/length2tails_reward/std": 0.3002374470233917, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5498507022857666, "rewards/thermo_reward/std": 0.5360844731330872, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.53125, "completions/mean_terminated_length": 272.53125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.07743867486715317, "epoch": 3.716, "frac_reward_zero_std": 0.0, "grad_norm": 0.12143448740243912, "learning_rate": 3.2108156205452506e-07, "loss": 0.0044, "num_tokens": 16222328.0, "reward": 13.630332946777344, "reward_std": 1.0454219579696655, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8501394987106323, "rewards/length2tails_reward/std": 0.2279629111289978, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4850144386291504, "rewards/thermo_reward/std": 0.9117720723152161, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.07970911916345358, "epoch": 3.718, "frac_reward_zero_std": 0.0, "grad_norm": 0.09778325259685516, "learning_rate": 3.2014065093305564e-07, "loss": -0.0011, "num_tokens": 16231057.0, "reward": 13.690130233764648, "reward_std": 1.086830973625183, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7724133133888245, "rewards/length2tails_reward/std": 0.2743414640426636, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5525827407836914, "rewards/thermo_reward/std": 0.9439334273338318, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.3125, "completions/mean_terminated_length": 271.3125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08170908875763416, "epoch": 3.7199999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.1340896338224411, "learning_rate": 3.19200857669463e-07, "loss": 0.0023, "num_tokens": 16239771.0, "reward": 13.950653076171875, "reward_std": 0.22248882055282593, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.738226056098938, "rewards/length2tails_reward/std": 0.25630003213882446, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07605164684355259, "epoch": 3.722, "frac_reward_zero_std": 0.0, "grad_norm": 0.9549767971038818, "learning_rate": 3.182621838090006e-07, "loss": -0.0006, "num_tokens": 16248511.0, "reward": 12.998878479003906, "reward_std": 3.2146313190460205, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.452592670917511, "rewards/kidney_reward/mean": 2.349088191986084, "rewards/kidney_reward/std": 1.102823257446289, "rewards/length2tails_reward/mean": 0.7339838743209839, "rewards/length2tails_reward/std": 0.3027809262275696, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2302253246307373, "rewards/thermo_reward/std": 1.777825117111206, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0779388127848506, "epoch": 3.724, "frac_reward_zero_std": 0.0, "grad_norm": 0.11938080191612244, "learning_rate": 3.17324630895082e-07, "loss": -0.0008, "num_tokens": 16257265.0, "reward": 13.55603313446045, "reward_std": 1.6116957664489746, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5346274375915527, "rewards/kidney_reward/std": 0.519594669342041, "rewards/length2tails_reward/mean": 0.7872436046600342, "rewards/length2tails_reward/std": 0.26303040981292725, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4814963340759277, "rewards/thermo_reward/std": 1.1194443702697754, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.08677832735702395, "epoch": 3.726, "frac_reward_zero_std": 0.0, "grad_norm": 0.19907931983470917, "learning_rate": 3.163882004692774e-07, "loss": 0.0037, "num_tokens": 16266008.0, "reward": 13.88401985168457, "reward_std": 0.37567073106765747, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8696078062057495, "rewards/length2tails_reward/std": 0.174245685338974, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0809440454468131, "epoch": 3.7279999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.06956552714109421, "learning_rate": 3.154528940713113e-07, "loss": -0.0064, "num_tokens": 16274764.0, "reward": 13.794866561889648, "reward_std": 1.1583558320999146, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8030489087104797, "rewards/length2tails_reward/std": 0.23350834846496582, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.654256820678711, "rewards/thermo_reward/std": 0.9887924194335938, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.5625, "completions/mean_terminated_length": 271.5625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.08229780849069357, "epoch": 3.73, "frac_reward_zero_std": 0.0, "grad_norm": 0.110500268638134, "learning_rate": 3.145187132390604e-07, "loss": 0.0032, "num_tokens": 16283486.0, "reward": 13.691943168640137, "reward_std": 0.9804035425186157, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7836110591888428, "rewards/length2tails_reward/std": 0.27624985575675964, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.553276300430298, "rewards/thermo_reward/std": 0.9403393864631653, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.08790844492614269, "epoch": 3.732, "frac_reward_zero_std": 0.0, "grad_norm": 0.07562534511089325, "learning_rate": 3.1358565950854976e-07, "loss": 0.0002, "num_tokens": 16292234.0, "reward": 13.780458450317383, "reward_std": 0.8375572562217712, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8365454077720642, "rewards/length2tails_reward/std": 0.2498382180929184, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.609139919281006, "rewards/thermo_reward/std": 0.8389954566955566, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.40625, "completions/mean_terminated_length": 271.40625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08745685964822769, "epoch": 3.734, "frac_reward_zero_std": 0.0, "grad_norm": 0.07099781185388565, "learning_rate": 3.12653734413952e-07, "loss": 0.0009, "num_tokens": 16300951.0, "reward": 13.770774841308594, "reward_std": 0.8302472233772278, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7253307104110718, "rewards/length2tails_reward/std": 0.27975571155548096, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.61057710647583, "rewards/thermo_reward/std": 0.8314604163169861, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08242936944589019, "epoch": 3.7359999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.139634907245636, "learning_rate": 3.11722939487584e-07, "loss": -0.0, "num_tokens": 16309723.0, "reward": 13.959595680236816, "reward_std": 0.22406338155269623, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8276500701904297, "rewards/length2tails_reward/std": 0.21905958652496338, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "entropy": 0.08443598542362452, "epoch": 3.738, "frac_reward_zero_std": 0.0, "grad_norm": 0.13019409775733948, "learning_rate": 3.1079327625990403e-07, "loss": 0.0008, "num_tokens": 16318478.0, "reward": 13.960803985595703, "reward_std": 0.22400137782096863, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.839726448059082, "rewards/length2tails_reward/std": 0.18779398500919342, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 274.34375, "completions/mean_terminated_length": 274.34375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08120616152882576, "epoch": 3.74, "frac_reward_zero_std": 0.0, "grad_norm": 0.09351641684770584, "learning_rate": 3.098647462595099e-07, "loss": -0.001, "num_tokens": 16327289.0, "reward": 13.639875411987305, "reward_std": 0.9192527532577515, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8695728182792664, "rewards/length2tails_reward/std": 0.20318247377872467, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.492612838745117, "rewards/thermo_reward/std": 0.8752034902572632, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.5625, "completions/mean_terminated_length": 271.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08450864860787988, "epoch": 3.742, "frac_reward_zero_std": 0.0, "grad_norm": 0.08416204154491425, "learning_rate": 3.0893735101313535e-07, "loss": -0.0036, "num_tokens": 16336011.0, "reward": 13.434139251708984, "reward_std": 1.6701217889785767, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7553468346595764, "rewards/length2tails_reward/std": 0.2860553562641144, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.2982993125915527, "rewards/thermo_reward/std": 1.5283452272415161, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.84375, "completions/mean_terminated_length": 273.84375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08435946051031351, "epoch": 3.7439999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.09009895473718643, "learning_rate": 3.0801109204564926e-07, "loss": -0.0041, "num_tokens": 16344806.0, "reward": 13.561756134033203, "reward_std": 1.422103762626648, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8623050451278687, "rewards/length2tails_reward/std": 0.20491470396518707, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.415219783782959, "rewards/thermo_reward/std": 1.2666558027267456, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.46875, "completions/mean_terminated_length": 273.46875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "entropy": 0.08299027197062969, "epoch": 3.746, "frac_reward_zero_std": 0.0, "grad_norm": 0.08488171547651291, "learning_rate": 3.070859708800515e-07, "loss": 0.0029, "num_tokens": 16353589.0, "reward": 13.885295867919922, "reward_std": 0.37565287947654724, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8823668360710144, "rewards/length2tails_reward/std": 0.18453270196914673, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "entropy": 0.08089287113398314, "epoch": 3.748, "frac_reward_zero_std": 0.0, "grad_norm": 0.10338589549064636, "learning_rate": 3.0616198903747157e-07, "loss": 0.0039, "num_tokens": 16362321.0, "reward": 13.87789535522461, "reward_std": 0.3744944632053375, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8083574771881104, "rewards/length2tails_reward/std": 0.20511792600154877, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0791722135618329, "epoch": 3.75, "frac_reward_zero_std": 0.0, "grad_norm": 0.11482106894254684, "learning_rate": 3.0523914803716577e-07, "loss": 0.0039, "num_tokens": 16371061.0, "reward": 13.954635620117188, "reward_std": 0.2227708101272583, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7780531048774719, "rewards/length2tails_reward/std": 0.2267376035451889, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08384049450978637, "epoch": 3.752, "frac_reward_zero_std": 0.0, "grad_norm": 0.09511395543813705, "learning_rate": 3.043174493965136e-07, "loss": -0.0002, "num_tokens": 16379799.0, "reward": 13.39472770690918, "reward_std": 2.6052229404449463, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4486875534057617, "rewards/kidney_reward/std": 0.860011100769043, "rewards/length2tails_reward/mean": 0.7839442491531372, "rewards/length2tails_reward/std": 0.25025689601898193, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4639697074890137, "rewards/thermo_reward/std": 1.4808531999588013, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09349369630217552, "epoch": 3.754, "frac_reward_zero_std": 0.0, "grad_norm": 0.05819488316774368, "learning_rate": 3.0339689463101714e-07, "loss": -0.0045, "num_tokens": 16388543.0, "reward": 13.307064056396484, "reward_std": 3.007098436355591, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.5390896797180176, "rewards/kidney_reward/std": 0.49435171484947205, "rewards/length2tails_reward/mean": 0.7880429625511169, "rewards/length2tails_reward/std": 0.24534600973129272, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.536116123199463, "rewards/thermo_reward/std": 0.8422486782073975, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.10010077618062496, "epoch": 3.7560000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.7483545541763306, "learning_rate": 3.0247748525429785e-07, "loss": -0.0078, "num_tokens": 16397297.0, "reward": 13.707642555236816, "reward_std": 0.6473450064659119, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.8495962619781494, "rewards/length2tails_reward/std": 0.16216562688350677, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.96875, "completions/mean_terminated_length": 273.96875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0832118820399046, "epoch": 3.758, "frac_reward_zero_std": 0.0, "grad_norm": 0.10160697251558304, "learning_rate": 3.0155922277809256e-07, "loss": 0.0002, "num_tokens": 16406096.0, "reward": 13.88062858581543, "reward_std": 0.37648022174835205, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8356945514678955, "rewards/length2tails_reward/std": 0.2622833847999573, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 273.46875, "completions/mean_terminated_length": 273.46875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.09201773721724749, "epoch": 3.76, "frac_reward_zero_std": 0.0, "grad_norm": 0.10607636719942093, "learning_rate": 3.006421087122538e-07, "loss": 0.0027, "num_tokens": 16414879.0, "reward": 13.839288711547852, "reward_std": 0.4233863949775696, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.821160078048706, "rewards/length2tails_reward/std": 0.23787914216518402, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.90625, "completions/mean_terminated_length": 269.90625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.08618500549346209, "epoch": 3.762, "frac_reward_zero_std": 0.0, "grad_norm": 0.11446670442819595, "learning_rate": 2.9972614456474533e-07, "loss": -0.0015, "num_tokens": 16423548.0, "reward": 13.815107345581055, "reward_std": 0.9864696264266968, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7166304588317871, "rewards/length2tails_reward/std": 0.2847166359424591, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6557796001434326, "rewards/thermo_reward/std": 0.9801791906356812, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.9375, "completions/mean_terminated_length": 273.9375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08535457588732243, "epoch": 3.7640000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.1087120845913887, "learning_rate": 2.9881133184163944e-07, "loss": 0.0012, "num_tokens": 16432346.0, "reward": 13.84271240234375, "reward_std": 0.42505595088005066, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8553938865661621, "rewards/length2tails_reward/std": 0.20128926634788513, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 273.28125, "completions/mean_terminated_length": 273.28125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08107516821473837, "epoch": 3.766, "frac_reward_zero_std": 0.0, "grad_norm": 3.0713000297546387, "learning_rate": 2.978976720471161e-07, "loss": 0.0098, "num_tokens": 16441123.0, "reward": 13.712474822998047, "reward_std": 1.392941951751709, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8027397990226746, "rewards/length2tails_reward/std": 0.23403459787368774, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5718953609466553, "rewards/thermo_reward/std": 1.2424302101135254, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08162663597613573, "epoch": 3.768, "frac_reward_zero_std": 0.0, "grad_norm": 0.08860152959823608, "learning_rate": 2.969851666834594e-07, "loss": -0.0009, "num_tokens": 16449874.0, "reward": 13.41884994506836, "reward_std": 1.3702086210250854, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7931690216064453, "rewards/length2tails_reward/std": 0.23420800268650055, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.279226779937744, "rewards/thermo_reward/std": 1.2637813091278076, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.5625, "completions/mean_terminated_length": 271.5625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.09294078592211008, "epoch": 3.77, "frac_reward_zero_std": 0.0, "grad_norm": 0.45369336009025574, "learning_rate": 2.9607381725105507e-07, "loss": -0.0213, "num_tokens": 16458596.0, "reward": 13.53420639038086, "reward_std": 1.4441404342651367, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.8011078834533691, "rewards/length2tails_reward/std": 0.2844718396663666, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4485087394714355, "rewards/thermo_reward/std": 1.2097461223602295, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.53125, "completions/mean_terminated_length": 272.53125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.08244166569784284, "epoch": 3.7720000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.12649722397327423, "learning_rate": 2.9516362524838847e-07, "loss": -0.0007, "num_tokens": 16467349.0, "reward": 13.838593482971191, "reward_std": 0.43084508180618286, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.814199686050415, "rewards/length2tails_reward/std": 0.2507236897945404, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 276.78125, "completions/mean_terminated_length": 276.78125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09738295339047909, "epoch": 3.774, "frac_reward_zero_std": 0.0, "grad_norm": 0.08789598941802979, "learning_rate": 2.942545921720412e-07, "loss": -0.0094, "num_tokens": 16476238.0, "reward": 13.785650253295898, "reward_std": 0.8378366827964783, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8598126173019409, "rewards/length2tails_reward/std": 0.2199796885251999, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.612004280090332, "rewards/thermo_reward/std": 0.823988139629364, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.78125, "completions/mean_terminated_length": 270.78125, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "entropy": 0.08558739256113768, "epoch": 3.776, "frac_reward_zero_std": 0.0, "grad_norm": 0.1280132681131363, "learning_rate": 2.9334671951668986e-07, "loss": 0.0029, "num_tokens": 16484935.0, "reward": 13.868514060974121, "reward_std": 0.3720110058784485, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7145437002182007, "rewards/length2tails_reward/std": 0.2871835231781006, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07987150177359581, "epoch": 3.778, "frac_reward_zero_std": 0.0, "grad_norm": 0.1321270912885666, "learning_rate": 2.924400087751031e-07, "loss": -0.0145, "num_tokens": 16493696.0, "reward": 13.993861198425293, "reward_std": 0.02611471712589264, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.771440863609314, "rewards/length2tails_reward/std": 0.2611500322818756, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.829052448272705, "rewards/thermo_reward/std": 0.0, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.06972839124500751, "epoch": 3.7800000000000002, "frac_reward_zero_std": 0.0, "grad_norm": 0.08080914616584778, "learning_rate": 2.9153446143813886e-07, "loss": -0.0049, "num_tokens": 16502432.0, "reward": 13.62398910522461, "reward_std": 1.421363115310669, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.5377397537231445, "rewards/kidney_reward/std": 0.5019885897636414, "rewards/length2tails_reward/mean": 0.7057427167892456, "rewards/length2tails_reward/std": 0.3191591203212738, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07965558161959052, "epoch": 3.782, "frac_reward_zero_std": 0.0, "grad_norm": 0.20845657587051392, "learning_rate": 2.906300789947421e-07, "loss": 0.0013, "num_tokens": 16511202.0, "reward": 13.137361526489258, "reward_std": 2.915266752243042, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.452592670917511, "rewards/kidney_reward/mean": 2.374690055847168, "rewards/kidney_reward/std": 0.9955797791481018, "rewards/length2tails_reward/mean": 0.811353325843811, "rewards/length2tails_reward/std": 0.2351706475019455, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3353686332702637, "rewards/thermo_reward/std": 1.490159034729004, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.0726462290622294, "epoch": 3.784, "frac_reward_zero_std": 0.0, "grad_norm": 0.10030462592840195, "learning_rate": 2.8972686293194306e-07, "loss": -0.0036, "num_tokens": 16519898.0, "reward": 13.725658416748047, "reward_std": 1.0770522356033325, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7097656726837158, "rewards/length2tails_reward/std": 0.3195935785770416, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08478438295423985, "epoch": 3.786, "frac_reward_zero_std": 0.0, "grad_norm": 0.08734481781721115, "learning_rate": 2.8882481473485276e-07, "loss": -0.0004, "num_tokens": 16528636.0, "reward": 13.870732307434082, "reward_std": 0.38143596053123474, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7367347478866577, "rewards/length2tails_reward/std": 0.2800016701221466, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.5625, "completions/mean_terminated_length": 270.5625, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.08649180363863707, "epoch": 3.7880000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.06724435091018677, "learning_rate": 2.879239358866632e-07, "loss": -0.0037, "num_tokens": 16537326.0, "reward": 13.382439613342285, "reward_std": 3.0069379806518555, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.5390896797180176, "rewards/kidney_reward/std": 0.49435171484947205, "rewards/length2tails_reward/mean": 0.74407958984375, "rewards/length2tails_reward/std": 0.2597999572753906, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6158881187438965, "rewards/thermo_reward/std": 0.8037141561508179, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 754.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 286.8125, "completions/mean_terminated_length": 271.7419128417969, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08883518259972334, "epoch": 3.79, "frac_reward_zero_std": 0.0, "grad_norm": 0.33261749148368835, "learning_rate": 2.870242278686432e-07, "loss": -0.0576, "num_tokens": 16546536.0, "reward": 13.99350357055664, "reward_std": 0.02659435383975506, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7678635120391846, "rewards/length2tails_reward/std": 0.2659464478492737, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.829052448272705, "rewards/thermo_reward/std": 0.0, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.78125, "completions/mean_terminated_length": 271.78125, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "entropy": 0.09447172377258539, "epoch": 3.792, "frac_reward_zero_std": 0.0, "grad_norm": 0.25320422649383545, "learning_rate": 2.861256921601367e-07, "loss": 0.0054, "num_tokens": 16555265.0, "reward": 13.790377616882324, "reward_std": 0.943525493144989, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7760162949562073, "rewards/length2tails_reward/std": 0.2553281784057617, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.652470111846924, "rewards/thermo_reward/std": 0.7984983921051025, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07667552959173918, "epoch": 3.794, "frac_reward_zero_std": 0.0, "grad_norm": 0.09639758616685867, "learning_rate": 2.852283302385602e-07, "loss": -0.0021, "num_tokens": 16564015.0, "reward": 13.875651359558105, "reward_std": 0.3780142664909363, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7859266996383667, "rewards/length2tails_reward/std": 0.2399982511997223, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.40625, "completions/mean_terminated_length": 272.40625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.07352221850305796, "epoch": 3.7960000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.08981627225875854, "learning_rate": 2.8433214357939917e-07, "loss": -0.0082, "num_tokens": 16572764.0, "reward": 13.6255521774292, "reward_std": 1.1251661777496338, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7463036775588989, "rewards/length2tails_reward/std": 0.310706228017807, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5179758071899414, "rewards/thermo_reward/std": 0.9320968985557556, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 272.59375, "completions/mean_terminated_length": 272.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08282679691910744, "epoch": 3.798, "frac_reward_zero_std": 0.0, "grad_norm": 0.08296433836221695, "learning_rate": 2.834371336562077e-07, "loss": -0.0004, "num_tokens": 16581519.0, "reward": 13.956785202026367, "reward_std": 0.22445662319660187, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7995490431785583, "rewards/length2tails_reward/std": 0.24117223918437958, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.28125, "completions/mean_terminated_length": 270.28125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.07467029197141528, "epoch": 3.8, "frac_reward_zero_std": 0.0, "grad_norm": 0.07870402932167053, "learning_rate": 2.8254330194060515e-07, "loss": -0.0, "num_tokens": 16590200.0, "reward": 13.618962287902832, "reward_std": 1.8877650499343872, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.526204824447632, "rewards/kidney_reward/std": 0.5672398209571838, "rewards/length2tails_reward/mean": 0.7450594902038574, "rewards/length2tails_reward/std": 0.2946070730686188, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6145756244659424, "rewards/thermo_reward/std": 1.0059599876403809, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.09375, "completions/mean_terminated_length": 272.09375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08942047785967588, "epoch": 3.802, "frac_reward_zero_std": 0.0, "grad_norm": 0.9487526416778564, "learning_rate": 2.816506499022725e-07, "loss": 0.0028, "num_tokens": 16598939.0, "reward": 13.454952239990234, "reward_std": 2.020958423614502, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.506537437438965, "rewards/kidney_reward/std": 0.5413090586662292, "rewards/length2tails_reward/mean": 0.7707405090332031, "rewards/length2tails_reward/std": 0.22672009468078613, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.467665195465088, "rewards/thermo_reward/std": 1.1921191215515137, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08338914159685373, "epoch": 3.8040000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.08682217448949814, "learning_rate": 2.807591790089521e-07, "loss": -0.0004, "num_tokens": 16607682.0, "reward": 13.701095581054688, "reward_std": 1.0283238887786865, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7714388966560364, "rewards/length2tails_reward/std": 0.23634468019008636, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.536287307739258, "rewards/thermo_reward/std": 1.029089331626892, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.15625, "completions/mean_terminated_length": 273.15625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0814851438626647, "epoch": 3.806, "frac_reward_zero_std": 0.0, "grad_norm": 0.10658857971429825, "learning_rate": 2.79868890726444e-07, "loss": 0.0046, "num_tokens": 16616455.0, "reward": 13.840827941894531, "reward_std": 0.42394959926605225, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8365510702133179, "rewards/length2tails_reward/std": 0.21344824135303497, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.46875, "completions/mean_terminated_length": 273.46875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.083722242154181, "epoch": 3.808, "frac_reward_zero_std": 0.0, "grad_norm": 0.07940282672643661, "learning_rate": 2.789797865186032e-07, "loss": -0.0018, "num_tokens": 16625238.0, "reward": 13.84145736694336, "reward_std": 0.4304184019565582, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8428469896316528, "rewards/length2tails_reward/std": 0.17144176363945007, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.71875, "completions/mean_terminated_length": 272.71875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08519653510302305, "epoch": 3.81, "frac_reward_zero_std": 0.0, "grad_norm": 0.21932841837406158, "learning_rate": 2.7809186784733827e-07, "loss": 0.0007, "num_tokens": 16633997.0, "reward": 13.91900634765625, "reward_std": 0.3112717568874359, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8206124305725098, "rewards/length2tails_reward/std": 0.23140211403369904, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08144298056140542, "epoch": 3.8120000000000003, "frac_reward_zero_std": 0.0, "grad_norm": 0.06131415069103241, "learning_rate": 2.7720513617260855e-07, "loss": -0.0029, "num_tokens": 16642721.0, "reward": 13.456586837768555, "reward_std": 1.9371612071990967, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.533867359161377, "rewards/kidney_reward/std": 0.523894190788269, "rewards/length2tails_reward/mean": 0.7181217670440674, "rewards/length2tails_reward/std": 0.2754988372325897, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3897223472595215, "rewards/thermo_reward/std": 1.4708243608474731, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.84375, "completions/mean_terminated_length": 272.84375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0787704661488533, "epoch": 3.814, "frac_reward_zero_std": 0.0, "grad_norm": 0.08822289109230042, "learning_rate": 2.7631959295242124e-07, "loss": -0.002, "num_tokens": 16651484.0, "reward": 13.826042175292969, "reward_std": 0.9932739734649658, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8458192348480225, "rewards/length2tails_reward/std": 0.16225025057792664, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6811540126800537, "rewards/thermo_reward/std": 0.836639404296875, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0801471695303917, "epoch": 3.816, "frac_reward_zero_std": 0.0, "grad_norm": 0.3141650855541229, "learning_rate": 2.754352396428302e-07, "loss": -0.0033, "num_tokens": 16660220.0, "reward": 13.208551406860352, "reward_std": 4.000711441040039, "rewards/fitness_reward/mean": 7.050926208496094, "rewards/fitness_reward/std": 1.7550898790359497, "rewards/kidney_reward/mean": 2.4628095626831055, "rewards/kidney_reward/std": 0.9258583188056946, "rewards/length2tails_reward/mean": 0.7736960053443909, "rewards/length2tails_reward/std": 0.26007622480392456, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5174460411071777, "rewards/thermo_reward/std": 1.3342645168304443, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07614208897575736, "epoch": 3.818, "frac_reward_zero_std": 0.0, "grad_norm": 0.14567843079566956, "learning_rate": 2.7455207769793153e-07, "loss": 0.0002, "num_tokens": 16668974.0, "reward": 13.886138916015625, "reward_std": 0.4331798255443573, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7655285596847534, "rewards/length2tails_reward/std": 0.2807142436504364, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.65625, "completions/mean_terminated_length": 272.65625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08579348865896463, "epoch": 3.82, "frac_reward_zero_std": 0.0, "grad_norm": 0.08240564912557602, "learning_rate": 2.736701085698635e-07, "loss": 0.0007, "num_tokens": 16677731.0, "reward": 13.637163162231445, "reward_std": 1.3797341585159302, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7908780574798584, "rewards/length2tails_reward/std": 0.271894633769989, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.497770071029663, "rewards/thermo_reward/std": 1.2343865633010864, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08684431295841932, "epoch": 3.822, "frac_reward_zero_std": 0.0, "grad_norm": 0.07356935739517212, "learning_rate": 2.7278933370880263e-07, "loss": -0.0037, "num_tokens": 16686501.0, "reward": 13.682441711425781, "reward_std": 1.0081303119659424, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8031569719314575, "rewards/length2tails_reward/std": 0.2724221348762512, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.541821002960205, "rewards/thermo_reward/std": 1.0000317096710205, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08624691423028708, "epoch": 3.824, "frac_reward_zero_std": 0.0, "grad_norm": 0.10585896670818329, "learning_rate": 2.7190975456296193e-07, "loss": 0.0025, "num_tokens": 16695246.0, "reward": 13.877214431762695, "reward_std": 0.37398141622543335, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8015420436859131, "rewards/length2tails_reward/std": 0.22604189813137054, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.96875, "completions/mean_terminated_length": 269.96875, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.07903222367167473, "epoch": 3.826, "frac_reward_zero_std": 0.0, "grad_norm": 0.13978196680545807, "learning_rate": 2.7103137257858863e-07, "loss": -0.001, "num_tokens": 16703917.0, "reward": 13.721193313598633, "reward_std": 0.9134910702705383, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7433258295059204, "rewards/length2tails_reward/std": 0.2548017203807831, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5591952800750732, "rewards/thermo_reward/std": 0.909770131111145, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.5625, "completions/mean_terminated_length": 270.5625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.09199087880551815, "epoch": 3.828, "frac_reward_zero_std": 0.0, "grad_norm": 0.11646510660648346, "learning_rate": 2.7015418919996057e-07, "loss": 0.004, "num_tokens": 16712607.0, "reward": 13.66675090789795, "reward_std": 1.4613165855407715, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.528264045715332, "rewards/kidney_reward/std": 0.5555903911590576, "rewards/length2tails_reward/mean": 0.8499467372894287, "rewards/length2tails_reward/std": 0.18801383674144745, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5923073291778564, "rewards/thermo_reward/std": 0.927997350692749, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 271.8125, "completions/mean_terminated_length": 271.8125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08600977715104818, "epoch": 3.83, "frac_reward_zero_std": 0.0, "grad_norm": 0.15362295508384705, "learning_rate": 2.6927820586938576e-07, "loss": 0.0015, "num_tokens": 16721337.0, "reward": 13.803441047668457, "reward_std": 0.5228808522224426, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7362752556800842, "rewards/length2tails_reward/std": 0.2287837564945221, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.40625, "completions/mean_terminated_length": 273.40625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08416588045656681, "epoch": 3.832, "frac_reward_zero_std": 0.0, "grad_norm": 0.09511641412973404, "learning_rate": 2.684034240271986e-07, "loss": -0.0012, "num_tokens": 16730118.0, "reward": 13.813632011413574, "reward_std": 0.5235826969146729, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8381756544113159, "rewards/length2tails_reward/std": 0.23721517622470856, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07561897998675704, "epoch": 3.834, "frac_reward_zero_std": 0.0, "grad_norm": 0.06131261587142944, "learning_rate": 2.6752984511175814e-07, "loss": -0.0001, "num_tokens": 16738883.0, "reward": 13.876596450805664, "reward_std": 0.37739771604537964, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7953763008117676, "rewards/length2tails_reward/std": 0.2556256949901581, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08190339151769876, "epoch": 3.836, "frac_reward_zero_std": 0.0, "grad_norm": 0.10995667427778244, "learning_rate": 2.6665747055944553e-07, "loss": 0.0029, "num_tokens": 16747628.0, "reward": 13.7941255569458, "reward_std": 0.46538564562797546, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7683846950531006, "rewards/length2tails_reward/std": 0.2302393913269043, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08025580272078514, "epoch": 3.838, "frac_reward_zero_std": 0.0, "grad_norm": 0.0882352814078331, "learning_rate": 2.657863018046611e-07, "loss": -0.0039, "num_tokens": 16756378.0, "reward": 13.775444030761719, "reward_std": 0.8556699156761169, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8053842782974243, "rewards/length2tails_reward/std": 0.24007809162139893, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.60724139213562, "rewards/thermo_reward/std": 0.8489658236503601, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0824215766042471, "epoch": 3.84, "frac_reward_zero_std": 0.0, "grad_norm": 0.15367527306079865, "learning_rate": 2.6491634027982324e-07, "loss": 0.0017, "num_tokens": 16765143.0, "reward": 13.848047256469727, "reward_std": 0.4790394902229309, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7834751009941101, "rewards/length2tails_reward/std": 0.2990477383136749, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.96875, "completions/mean_terminated_length": 272.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08515264093875885, "epoch": 3.842, "frac_reward_zero_std": 0.0, "grad_norm": 0.0763060674071312, "learning_rate": 2.64047587415365e-07, "loss": 0.0018, "num_tokens": 16773910.0, "reward": 13.881507873535156, "reward_std": 0.37522459030151367, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8444912433624268, "rewards/length2tails_reward/std": 0.1953170895576477, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 283.1875, "completions/mean_terminated_length": 283.1875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0946641843765974, "epoch": 3.844, "frac_reward_zero_std": 0.0, "grad_norm": 0.3928823173046112, "learning_rate": 2.6318004463973233e-07, "loss": -0.0102, "num_tokens": 16783004.0, "reward": 13.092012405395508, "reward_std": 4.512775421142578, "rewards/fitness_reward/mean": 6.999444007873535, "rewards/fitness_reward/std": 2.046316623687744, "rewards/kidney_reward/mean": 2.412200927734375, "rewards/kidney_reward/std": 1.0636920928955078, "rewards/length2tails_reward/mean": 0.7764885425567627, "rewards/length2tails_reward/std": 0.2910473942756653, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.502717971801758, "rewards/thermo_reward/std": 1.4153804779052734, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 270.96875, "completions/mean_terminated_length": 270.96875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.08065614709630609, "epoch": 3.846, "frac_reward_zero_std": 0.0, "grad_norm": 0.08240630477666855, "learning_rate": 2.6231371337938144e-07, "loss": -0.0057, "num_tokens": 16791707.0, "reward": 13.549034118652344, "reward_std": 1.318332552909851, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7552239894866943, "rewards/length2tails_reward/std": 0.3136855959892273, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3858466148376465, "rewards/thermo_reward/std": 1.3002216815948486, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08426021132618189, "epoch": 3.848, "frac_reward_zero_std": 0.0, "grad_norm": 0.12145797163248062, "learning_rate": 2.6144859505877603e-07, "loss": -0.0031, "num_tokens": 16800450.0, "reward": 13.404792785644531, "reward_std": 2.175577163696289, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.510470390319824, "rewards/kidney_reward/std": 0.6562471985816956, "rewards/length2tails_reward/mean": 0.7557017803192139, "rewards/length2tails_reward/std": 0.27980005741119385, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4150757789611816, "rewards/thermo_reward/std": 1.337871789932251, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 276.5, "completions/mean_terminated_length": 276.5, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.10557070840150118, "epoch": 3.85, "frac_reward_zero_std": 0.0, "grad_norm": 0.2935408651828766, "learning_rate": 2.6058469110038626e-07, "loss": -0.0007, "num_tokens": 16809330.0, "reward": 13.773045539855957, "reward_std": 0.5495082139968872, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8311747312545776, "rewards/length2tails_reward/std": 0.23529881238937378, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08785425685346127, "epoch": 3.852, "frac_reward_zero_std": 0.0, "grad_norm": 0.11054041981697083, "learning_rate": 2.597220029246846e-07, "loss": -0.0045, "num_tokens": 16818102.0, "reward": 13.029808044433594, "reward_std": 2.00207781791687, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5444021224975586, "rewards/kidney_reward/std": 0.2592725455760956, "rewards/length2tails_reward/mean": 0.8216935396194458, "rewards/length2tails_reward/std": 0.22593268752098083, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 2.942051410675049, "rewards/thermo_reward/std": 1.8460910320281982, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07975840009748936, "epoch": 3.854, "frac_reward_zero_std": 0.0, "grad_norm": 0.12637239694595337, "learning_rate": 2.5886053195014534e-07, "loss": -0.0028, "num_tokens": 16826834.0, "reward": 13.594480514526367, "reward_std": 1.4269717931747437, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7686116695404053, "rewards/length2tails_reward/std": 0.2542043924331665, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4573142528533936, "rewards/thermo_reward/std": 1.3170541524887085, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 272.90625, "completions/mean_terminated_length": 272.90625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.07655230350792408, "epoch": 3.856, "frac_reward_zero_std": 0.0, "grad_norm": 0.17291785776615143, "learning_rate": 2.5800027959324087e-07, "loss": 0.0039, "num_tokens": 16835599.0, "reward": 13.932228088378906, "reward_std": 0.3779231905937195, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8275623321533203, "rewards/length2tails_reward/std": 0.2156936079263687, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.21875, "completions/mean_terminated_length": 273.21875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08698383998125792, "epoch": 3.858, "frac_reward_zero_std": 0.0, "grad_norm": 0.10282512754201889, "learning_rate": 2.571412472684401e-07, "loss": 0.0015, "num_tokens": 16844374.0, "reward": 13.840457916259766, "reward_std": 0.43196240067481995, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8328460454940796, "rewards/length2tails_reward/std": 0.19952915608882904, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07852550689131021, "epoch": 3.86, "frac_reward_zero_std": 0.0, "grad_norm": 0.08711478114128113, "learning_rate": 2.5628343638820625e-07, "loss": -0.0042, "num_tokens": 16853142.0, "reward": 13.91780948638916, "reward_std": 0.3206358551979065, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8086421489715576, "rewards/length2tails_reward/std": 0.24440136551856995, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08650944288820028, "epoch": 3.862, "frac_reward_zero_std": 0.0, "grad_norm": 0.13395126163959503, "learning_rate": 2.554268483629931e-07, "loss": -0.0035, "num_tokens": 16861898.0, "reward": 13.796703338623047, "reward_std": 0.4803380072116852, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7941558361053467, "rewards/length2tails_reward/std": 0.27170467376708984, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 271.28125, "completions/mean_terminated_length": 271.28125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08016407676041126, "epoch": 3.864, "frac_reward_zero_std": 0.0, "grad_norm": 0.11705422401428223, "learning_rate": 2.5457148460124476e-07, "loss": -0.0011, "num_tokens": 16870611.0, "reward": 13.907598495483398, "reward_std": 0.31562539935112, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7065348625183105, "rewards/length2tails_reward/std": 0.28879106044769287, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08776333183050156, "epoch": 3.866, "frac_reward_zero_std": 0.0, "grad_norm": 0.07915785908699036, "learning_rate": 2.5371734650939204e-07, "loss": -0.0035, "num_tokens": 16879332.0, "reward": 13.870550155639648, "reward_std": 0.38491547107696533, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7349153757095337, "rewards/length2tails_reward/std": 0.2707245945930481, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.625, "completions/mean_terminated_length": 273.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07789402874186635, "epoch": 3.868, "frac_reward_zero_std": 0.0, "grad_norm": 0.1011756882071495, "learning_rate": 2.528644354918503e-07, "loss": 0.0028, "num_tokens": 16888120.0, "reward": 13.960830688476562, "reward_std": 0.22377075254917145, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8399984240531921, "rewards/length2tails_reward/std": 0.21737302839756012, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.90625, "completions/mean_terminated_length": 271.90625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08203192008659244, "epoch": 3.87, "frac_reward_zero_std": 0.0, "grad_norm": 0.08774164319038391, "learning_rate": 2.5201275295101775e-07, "loss": 0.0019, "num_tokens": 16896853.0, "reward": 13.91592788696289, "reward_std": 0.3097189962863922, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7898266315460205, "rewards/length2tails_reward/std": 0.19600234925746918, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 272.09375, "completions/mean_terminated_length": 272.09375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08788663055747747, "epoch": 3.872, "frac_reward_zero_std": 0.0, "grad_norm": 0.10273593664169312, "learning_rate": 2.511623002872718e-07, "loss": 0.0005, "num_tokens": 16905592.0, "reward": 13.651347160339355, "reward_std": 1.1073802709579468, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.803066611289978, "rewards/length2tails_reward/std": 0.2153354436159134, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.510735511779785, "rewards/thermo_reward/std": 0.968666672706604, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.875, "completions/mean_terminated_length": 270.875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.08587850071489811, "epoch": 3.874, "frac_reward_zero_std": 0.0, "grad_norm": 0.15995830297470093, "learning_rate": 2.5031307889896847e-07, "loss": 0.0002, "num_tokens": 16914292.0, "reward": 13.452787399291992, "reward_std": 1.6182825565338135, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7807340025901794, "rewards/length2tails_reward/std": 0.25488072633743286, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3992772102355957, "rewards/thermo_reward/std": 1.2482622861862183, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.15625, "completions/mean_terminated_length": 271.15625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.08146081399172544, "epoch": 3.876, "frac_reward_zero_std": 0.0, "grad_norm": 0.11728298664093018, "learning_rate": 2.494650901824389e-07, "loss": -0.0016, "num_tokens": 16923001.0, "reward": 13.694450378417969, "reward_std": 1.0769379138946533, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.789922833442688, "rewards/length2tails_reward/std": 0.3011004328727722, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.555152177810669, "rewards/thermo_reward/std": 0.9306294918060303, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 272.15625, "completions/mean_terminated_length": 272.15625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.08420975971966982, "epoch": 3.878, "frac_reward_zero_std": 0.0, "grad_norm": 0.20011083781719208, "learning_rate": 2.486183355319875e-07, "loss": -0.0079, "num_tokens": 16931742.0, "reward": 13.739608764648438, "reward_std": 0.861034631729126, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8160064220428467, "rewards/length2tails_reward/std": 0.20948392152786255, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.570343494415283, "rewards/thermo_reward/std": 0.852788507938385, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.08624033536761999, "epoch": 3.88, "frac_reward_zero_std": 0.0, "grad_norm": 0.10202612727880478, "learning_rate": 2.4777281633988976e-07, "loss": 0.0011, "num_tokens": 16940522.0, "reward": 13.691545486450195, "reward_std": 0.9300704002380371, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8535068035125732, "rewards/length2tails_reward/std": 0.1830395758152008, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5185298919677734, "rewards/thermo_reward/std": 0.929313600063324, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 273.625, "completions/mean_terminated_length": 273.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07235424825921655, "epoch": 3.882, "frac_reward_zero_std": 0.0, "grad_norm": 0.10096503794193268, "learning_rate": 2.4692853399638913e-07, "loss": -0.0021, "num_tokens": 16949310.0, "reward": 13.85297966003418, "reward_std": 0.48421013355255127, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8328012228012085, "rewards/length2tails_reward/std": 0.23313400149345398, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08609567582607269, "epoch": 3.884, "frac_reward_zero_std": 0.0, "grad_norm": 0.06079360470175743, "learning_rate": 2.4608548988969593e-07, "loss": -0.0058, "num_tokens": 16958061.0, "reward": 13.425346374511719, "reward_std": 3.0079617500305176, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.5390896797180176, "rewards/kidney_reward/std": 0.49435171484947205, "rewards/length2tails_reward/mean": 0.7742855548858643, "rewards/length2tails_reward/std": 0.2837405800819397, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6557741165161133, "rewards/thermo_reward/std": 0.7805873155593872, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.07651135697960854, "epoch": 3.886, "frac_reward_zero_std": 0.0, "grad_norm": 0.08636076003313065, "learning_rate": 2.452436854059843e-07, "loss": -0.0013, "num_tokens": 16966811.0, "reward": 13.842947006225586, "reward_std": 0.43455979228019714, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8577354550361633, "rewards/length2tails_reward/std": 0.1791587769985199, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08422570209950209, "epoch": 3.888, "frac_reward_zero_std": 0.0, "grad_norm": 0.15154993534088135, "learning_rate": 2.4440312192939037e-07, "loss": -0.0, "num_tokens": 16975572.0, "reward": 12.78995418548584, "reward_std": 3.594736337661743, "rewards/fitness_reward/mean": 7.188657760620117, "rewards/fitness_reward/std": 0.7179933190345764, "rewards/kidney_reward/mean": 2.3057146072387695, "rewards/kidney_reward/std": 1.1698397397994995, "rewards/length2tails_reward/mean": 0.7942888140678406, "rewards/length2tails_reward/std": 0.27767103910446167, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1161532402038574, "rewards/thermo_reward/std": 1.8846759796142578, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07934307120740414, "epoch": 3.89, "frac_reward_zero_std": 0.0, "grad_norm": 0.16269008815288544, "learning_rate": 2.435638008420098e-07, "loss": 0.0047, "num_tokens": 16984276.0, "reward": 13.656649589538574, "reward_std": 1.0255653858184814, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7091237902641296, "rewards/length2tails_reward/std": 0.2751384675502777, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5254311561584473, "rewards/thermo_reward/std": 0.8948387503623962, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 287.40625, "completions/mean_terminated_length": 287.40625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.09849395509809256, "epoch": 3.892, "frac_reward_zero_std": 0.0, "grad_norm": 0.30336740612983704, "learning_rate": 2.4272572352389485e-07, "loss": -0.0189, "num_tokens": 16993505.0, "reward": 13.672941207885742, "reward_std": 1.2645598649978638, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7417504787445068, "rewards/length2tails_reward/std": 0.3114211857318878, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5384607315063477, "rewards/thermo_reward/std": 1.2190366983413696, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0812322972342372, "epoch": 3.894, "frac_reward_zero_std": 0.0, "grad_norm": 0.11934258043766022, "learning_rate": 2.41888891353053e-07, "loss": -0.0014, "num_tokens": 17002245.0, "reward": 13.91276741027832, "reward_std": 0.31673482060432434, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7582290172576904, "rewards/length2tails_reward/std": 0.23247385025024414, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08449868764728308, "epoch": 3.896, "frac_reward_zero_std": 0.0, "grad_norm": 0.11675684154033661, "learning_rate": 2.410533057054446e-07, "loss": -0.0017, "num_tokens": 17011015.0, "reward": 13.878592491149902, "reward_std": 0.37778231501579285, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8153381943702698, "rewards/length2tails_reward/std": 0.26463058590888977, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.15625, "completions/mean_terminated_length": 271.15625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.0786692202091217, "epoch": 3.898, "frac_reward_zero_std": 0.0, "grad_norm": 0.12710154056549072, "learning_rate": 2.4021896795498044e-07, "loss": 0.004, "num_tokens": 17019724.0, "reward": 13.421014785766602, "reward_std": 1.9942625761032104, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.4906506538391113, "rewards/kidney_reward/std": 0.6279568672180176, "rewards/length2tails_reward/mean": 0.7816104888916016, "rewards/length2tails_reward/std": 0.2606870234012604, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4485268592834473, "rewards/thermo_reward/std": 1.0938901901245117, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.08455433277413249, "epoch": 3.9, "frac_reward_zero_std": 0.0, "grad_norm": 0.12284710258245468, "learning_rate": 2.3938587947351917e-07, "loss": 0.004, "num_tokens": 17028475.0, "reward": 13.583101272583008, "reward_std": 1.096427083015442, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8623867034912109, "rewards/length2tails_reward/std": 0.21832647919654846, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.436556816101074, "rewards/thermo_reward/std": 0.9650663733482361, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.090028902515769, "epoch": 3.902, "frac_reward_zero_std": 0.0, "grad_norm": 0.09488416463136673, "learning_rate": 2.3855404163086556e-07, "loss": -0.0075, "num_tokens": 17037243.0, "reward": 13.706623077392578, "reward_std": 1.1716996431350708, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8330482244491577, "rewards/length2tails_reward/std": 0.2136765569448471, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5356531143188477, "rewards/thermo_reward/std": 1.1550958156585693, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 273.71875, "completions/mean_terminated_length": 273.71875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0850351257249713, "epoch": 3.904, "frac_reward_zero_std": 0.0, "grad_norm": 0.14141486585140228, "learning_rate": 2.3772345579476816e-07, "loss": -0.0007, "num_tokens": 17046034.0, "reward": 13.774679183959961, "reward_std": 0.5537977814674377, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8475044369697571, "rewards/length2tails_reward/std": 0.1927335411310196, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.07830340228974819, "epoch": 3.906, "frac_reward_zero_std": 0.0, "grad_norm": 0.0867738425731659, "learning_rate": 2.3689412333091618e-07, "loss": 0.0036, "num_tokens": 17054762.0, "reward": 13.950662612915039, "reward_std": 0.22277358174324036, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7383153438568115, "rewards/length2tails_reward/std": 0.2902863621711731, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07980364747345448, "epoch": 3.908, "frac_reward_zero_std": 0.0, "grad_norm": 0.13662275671958923, "learning_rate": 2.3606604560293875e-07, "loss": 0.0002, "num_tokens": 17063483.0, "reward": 13.750734329223633, "reward_std": 0.9528560638427734, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7522097826004028, "rewards/length2tails_reward/std": 0.2517887353897095, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6152076721191406, "rewards/thermo_reward/std": 0.807259738445282, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.15625, "completions/mean_terminated_length": 271.15625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.08249637857079506, "epoch": 3.91, "frac_reward_zero_std": 0.0, "grad_norm": 0.09899752587080002, "learning_rate": 2.352392239724016e-07, "loss": 0.0014, "num_tokens": 17072192.0, "reward": 13.777750015258789, "reward_std": 0.8359129428863525, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8064588308334351, "rewards/length2tails_reward/std": 0.2694641649723053, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6094391345977783, "rewards/thermo_reward/std": 0.8374254107475281, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.09375, "completions/mean_terminated_length": 273.09375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07944468408823013, "epoch": 3.912, "frac_reward_zero_std": 0.0, "grad_norm": 0.1025310754776001, "learning_rate": 2.3441365979880522e-07, "loss": 0.002, "num_tokens": 17080963.0, "reward": 13.541351318359375, "reward_std": 1.7019767761230469, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5238256454467773, "rewards/kidney_reward/std": 0.5806989669799805, "rewards/length2tails_reward/mean": 0.8096390962600708, "rewards/length2tails_reward/std": 0.2314687967300415, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4753763675689697, "rewards/thermo_reward/std": 1.15151047706604, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08525500353425741, "epoch": 3.914, "frac_reward_zero_std": 0.0, "grad_norm": 0.13864275813102722, "learning_rate": 2.335893544395826e-07, "loss": -0.0004, "num_tokens": 17089687.0, "reward": 13.91266918182373, "reward_std": 0.31485801935195923, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7572414875030518, "rewards/length2tails_reward/std": 0.22235172986984253, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.34375, "completions/mean_terminated_length": 271.34375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.08531990181654692, "epoch": 3.916, "frac_reward_zero_std": 0.0, "grad_norm": 0.08629897981882095, "learning_rate": 2.3276630925009632e-07, "loss": 0.0032, "num_tokens": 17098402.0, "reward": 13.874603271484375, "reward_std": 0.37391039729118347, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7754376530647278, "rewards/length2tails_reward/std": 0.27152007818222046, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.07741576712578535, "epoch": 3.918, "frac_reward_zero_std": 0.0, "grad_norm": 0.11358784884214401, "learning_rate": 2.3194452558363776e-07, "loss": 0.0015, "num_tokens": 17107137.0, "reward": 13.614516258239746, "reward_std": 1.0182923078536987, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8247540593147278, "rewards/length2tails_reward/std": 0.23363043367862701, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4717354774475098, "rewards/thermo_reward/std": 0.9769327640533447, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.9375, "completions/mean_terminated_length": 273.9375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08296280819922686, "epoch": 3.92, "frac_reward_zero_std": 0.0, "grad_norm": 0.0738103836774826, "learning_rate": 2.3112400479142347e-07, "loss": -0.0021, "num_tokens": 17115935.0, "reward": 13.73696231842041, "reward_std": 0.8988210558891296, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8741310834884644, "rewards/length2tails_reward/std": 0.19006100296974182, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5618844032287598, "rewards/thermo_reward/std": 0.8959512710571289, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 274.09375, "completions/mean_terminated_length": 274.09375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.08895572554320097, "epoch": 3.922, "frac_reward_zero_std": 0.0, "grad_norm": 0.16806237399578094, "learning_rate": 2.3030474822259394e-07, "loss": -0.0007, "num_tokens": 17124738.0, "reward": 13.75033950805664, "reward_std": 1.0203158855438232, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.877525806427002, "rewards/length2tails_reward/std": 0.18097041547298431, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6022818088531494, "rewards/thermo_reward/std": 0.8750991225242615, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08735064696520567, "epoch": 3.924, "frac_reward_zero_std": 0.0, "grad_norm": 0.0669313445687294, "learning_rate": 2.2948675722421085e-07, "loss": 0.0011, "num_tokens": 17133478.0, "reward": 13.766349792480469, "reward_std": 0.8892486095428467, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7945989966392517, "rewards/length2tails_reward/std": 0.2241227924823761, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5992252826690674, "rewards/thermo_reward/std": 0.8912632465362549, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.53125, "completions/mean_terminated_length": 272.53125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08859865088015795, "epoch": 3.926, "frac_reward_zero_std": 0.0, "grad_norm": 0.09937973320484161, "learning_rate": 2.2867003314125443e-07, "loss": -0.0028, "num_tokens": 17142231.0, "reward": 13.879058837890625, "reward_std": 0.384976863861084, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.820002555847168, "rewards/length2tails_reward/std": 0.21553631126880646, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.71875, "completions/mean_terminated_length": 273.71875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08682218100875616, "epoch": 3.928, "frac_reward_zero_std": 0.0, "grad_norm": 0.08087195456027985, "learning_rate": 2.278545773166225e-07, "loss": -0.0032, "num_tokens": 17151022.0, "reward": 13.221725463867188, "reward_std": 3.055856227874756, "rewards/fitness_reward/mean": 6.987466335296631, "rewards/fitness_reward/std": 2.11407208442688, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8249701857566833, "rewards/length2tails_reward/std": 0.2575210630893707, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4526407718658447, "rewards/thermo_reward/std": 0.8887622356414795, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.10153763461858034, "epoch": 3.93, "frac_reward_zero_std": 0.0, "grad_norm": 0.1791462004184723, "learning_rate": 2.2704039109112716e-07, "loss": 0.0009, "num_tokens": 17159757.0, "reward": 13.688663482666016, "reward_std": 1.031781554222107, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.854976236820221, "rewards/length2tails_reward/std": 0.18852630257606506, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5702195167541504, "rewards/thermo_reward/std": 0.8534172773361206, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08356020133942366, "epoch": 3.932, "frac_reward_zero_std": 0.0, "grad_norm": 0.09256550669670105, "learning_rate": 2.262274758034931e-07, "loss": 0.0042, "num_tokens": 17168515.0, "reward": 13.879671096801758, "reward_std": 0.3743593692779541, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8261242508888245, "rewards/length2tails_reward/std": 0.181783989071846, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.96875, "completions/mean_terminated_length": 272.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08657645247876644, "epoch": 3.934, "frac_reward_zero_std": 0.0, "grad_norm": 0.11939027160406113, "learning_rate": 2.254158327903557e-07, "loss": 0.002, "num_tokens": 17177282.0, "reward": 13.892132759094238, "reward_std": 0.43261972069740295, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8254703283309937, "rewards/length2tails_reward/std": 0.2279188632965088, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.6875, "completions/mean_terminated_length": 271.6875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.08298607263714075, "epoch": 3.936, "frac_reward_zero_std": 0.0, "grad_norm": 0.08153997361660004, "learning_rate": 2.246054633862575e-07, "loss": -0.0006, "num_tokens": 17186008.0, "reward": 13.959012031555176, "reward_std": 0.22808024287223816, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.821814775466919, "rewards/length2tails_reward/std": 0.22938142716884613, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 270.90625, "completions/mean_terminated_length": 270.90625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.08575815940275788, "epoch": 3.9379999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.11718124151229858, "learning_rate": 2.2379636892364717e-07, "loss": 0.0043, "num_tokens": 17194709.0, "reward": 13.644588470458984, "reward_std": 1.0277619361877441, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.5717613697052, "rewards/kidney_reward/std": 0.2153141349554062, "rewards/length2tails_reward/mean": 0.7811826467514038, "rewards/length2tails_reward/std": 0.28118860721588135, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.53352427482605, "rewards/thermo_reward/std": 0.8549103736877441, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 276.34375, "completions/mean_terminated_length": 276.34375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08746970538049936, "epoch": 3.94, "frac_reward_zero_std": 0.0, "grad_norm": 0.25457999110221863, "learning_rate": 2.229885507328776e-07, "loss": -0.0016, "num_tokens": 17203584.0, "reward": 13.730911254882812, "reward_std": 0.5785952210426331, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.8086965084075928, "rewards/length2tails_reward/std": 0.2609972357749939, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5897364616394043, "rewards/thermo_reward/std": 0.5061468482017517, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.71875, "completions/mean_terminated_length": 272.71875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08229434955865145, "epoch": 3.942, "frac_reward_zero_std": 0.0, "grad_norm": 0.11690180748701096, "learning_rate": 2.2218201014220262e-07, "loss": 0.0013, "num_tokens": 17212343.0, "reward": 13.480477333068848, "reward_std": 1.8184738159179688, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.498993396759033, "rewards/kidney_reward/std": 0.7211709022521973, "rewards/length2tails_reward/mean": 0.8054592609405518, "rewards/length2tails_reward/std": 0.22931167483329773, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4397528171539307, "rewards/thermo_reward/std": 1.1389209032058716, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07960274396464229, "epoch": 3.944, "frac_reward_zero_std": 0.0, "grad_norm": 0.12034840136766434, "learning_rate": 2.2137674847777576e-07, "loss": 0.0023, "num_tokens": 17221099.0, "reward": 13.918210983276367, "reward_std": 0.31029683351516724, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8126543164253235, "rewards/length2tails_reward/std": 0.22591090202331543, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 272.1875, "completions/mean_terminated_length": 272.1875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.09035251941531897, "epoch": 3.9459999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.07418270409107208, "learning_rate": 2.205727670636478e-07, "loss": -0.0054, "num_tokens": 17229841.0, "reward": 13.46790885925293, "reward_std": 3.006690502166748, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.5390896797180176, "rewards/kidney_reward/std": 0.49435171484947205, "rewards/length2tails_reward/mean": 0.8010534048080444, "rewards/length2tails_reward/std": 0.26960352063179016, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.69566011428833, "rewards/thermo_reward/std": 0.7545809149742126, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 274.09375, "completions/mean_terminated_length": 274.09375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08542555291205645, "epoch": 3.948, "frac_reward_zero_std": 0.0, "grad_norm": 0.13188742101192474, "learning_rate": 2.197700672217635e-07, "loss": -0.0042, "num_tokens": 17238644.0, "reward": 12.878170013427734, "reward_std": 4.818774700164795, "rewards/fitness_reward/mean": 7.000814437866211, "rewards/fitness_reward/std": 2.0385630130767822, "rewards/kidney_reward/mean": 2.3968443870544434, "rewards/kidney_reward/std": 1.14970862865448, "rewards/length2tails_reward/mean": 0.7748308777809143, "rewards/length2tails_reward/std": 0.26282769441604614, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3030285835266113, "rewards/thermo_reward/std": 1.8083046674728394, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08252169284969568, "epoch": 3.95, "frac_reward_zero_std": 0.0, "grad_norm": 0.09868843853473663, "learning_rate": 2.1896865027196143e-07, "loss": 0.0019, "num_tokens": 17247394.0, "reward": 13.794919967651367, "reward_std": 0.4678279757499695, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7763217687606812, "rewards/length2tails_reward/std": 0.28034308552742004, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.3125, "completions/mean_terminated_length": 272.3125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0822441759519279, "epoch": 3.952, "frac_reward_zero_std": 0.0, "grad_norm": 0.07563626766204834, "learning_rate": 2.181685175319702e-07, "loss": -0.0039, "num_tokens": 17256140.0, "reward": 13.918163299560547, "reward_std": 0.3168083131313324, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8121821284294128, "rewards/length2tails_reward/std": 0.2070104479789734, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7492804527282715, "rewards/thermo_reward/std": 0.3138989210128784, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.40625, "completions/mean_terminated_length": 272.40625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.08298068400472403, "epoch": 3.9539999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.0864865854382515, "learning_rate": 2.1736967031740737e-07, "loss": -0.001, "num_tokens": 17264889.0, "reward": 13.878406524658203, "reward_std": 0.3799617886543274, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8134766817092896, "rewards/length2tails_reward/std": 0.18320176005363464, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.28125, "completions/mean_terminated_length": 272.28125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.08510293252766132, "epoch": 3.956, "frac_reward_zero_std": 0.0, "grad_norm": 0.07798423618078232, "learning_rate": 2.1657210994177643e-07, "loss": 0.0043, "num_tokens": 17273634.0, "reward": 13.881671905517578, "reward_std": 0.37807029485702515, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8461273312568665, "rewards/length2tails_reward/std": 0.20430795848369598, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.9375, "completions/mean_terminated_length": 271.9375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "entropy": 0.08048784267157316, "epoch": 3.958, "frac_reward_zero_std": 0.0, "grad_norm": 0.10507718473672867, "learning_rate": 2.1577583771646467e-07, "loss": 0.0022, "num_tokens": 17282368.0, "reward": 13.877695083618164, "reward_std": 0.3744347095489502, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.806357741355896, "rewards/length2tails_reward/std": 0.22114646434783936, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 269.375, "completions/mean_terminated_length": 269.375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.07727627642452717, "epoch": 3.96, "frac_reward_zero_std": 0.0, "grad_norm": 0.0931129902601242, "learning_rate": 2.1498085495074193e-07, "loss": -0.0019, "num_tokens": 17291020.0, "reward": 13.947714805603027, "reward_std": 0.23339204490184784, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.708836555480957, "rewards/length2tails_reward/std": 0.31088170409202576, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 271.6875, "completions/mean_terminated_length": 271.6875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.07627571607008576, "epoch": 3.9619999999999997, "frac_reward_zero_std": 0.0, "grad_norm": 0.19174930453300476, "learning_rate": 2.1418716295175766e-07, "loss": 0.0077, "num_tokens": 17299746.0, "reward": 13.401413917541504, "reward_std": 2.5488474369049072, "rewards/fitness_reward/mean": 7.303675651550293, "rewards/fitness_reward/std": 0.3253214955329895, "rewards/kidney_reward/mean": 2.450007915496826, "rewards/kidney_reward/std": 0.8526635766029358, "rewards/length2tails_reward/mean": 0.7977835536003113, "rewards/length2tails_reward/std": 0.23642198741436005, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.4679512977600098, "rewards/thermo_reward/std": 1.3959442377090454, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 271.5625, "completions/mean_terminated_length": 271.5625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08476059697568417, "epoch": 3.964, "frac_reward_zero_std": 0.0, "grad_norm": 0.09583441913127899, "learning_rate": 2.1339476302453873e-07, "loss": 0.0036, "num_tokens": 17308468.0, "reward": 12.910294532775879, "reward_std": 5.181497573852539, "rewards/fitness_reward/mean": 6.980969429016113, "rewards/fitness_reward/std": 2.1508259773254395, "rewards/kidney_reward/mean": 2.4055721759796143, "rewards/kidney_reward/std": 1.2496414184570312, "rewards/length2tails_reward/mean": 0.7198496460914612, "rewards/length2tails_reward/std": 0.29080212116241455, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3517682552337646, "rewards/thermo_reward/std": 1.819327712059021, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.46875, "completions/mean_terminated_length": 272.46875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08854109048843384, "epoch": 3.966, "frac_reward_zero_std": 0.0, "grad_norm": 0.11510281264781952, "learning_rate": 2.1260365647198797e-07, "loss": 0.0028, "num_tokens": 17317219.0, "reward": 13.87903118133545, "reward_std": 0.37419331073760986, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8197149038314819, "rewards/length2tails_reward/std": 0.20691503584384918, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07455269154161215, "epoch": 3.968, "frac_reward_zero_std": 0.0, "grad_norm": 0.20295102894306183, "learning_rate": 2.118138445948815e-07, "loss": 0.0036, "num_tokens": 17325966.0, "reward": 13.744754791259766, "reward_std": 0.9908297061920166, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7723274230957031, "rewards/length2tails_reward/std": 0.27208390831947327, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6072168350219727, "rewards/thermo_reward/std": 0.8490945100784302, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 273.59375, "completions/mean_terminated_length": 273.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08819251973181963, "epoch": 3.9699999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.16574658453464508, "learning_rate": 2.1102532869186585e-07, "loss": 0.0052, "num_tokens": 17334753.0, "reward": 13.774247169494629, "reward_std": 0.5482454299926758, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.843185544013977, "rewards/length2tails_reward/std": 0.22929775714874268, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.629622459411621, "rewards/thermo_reward/std": 0.4708483815193176, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.08976717293262482, "epoch": 3.972, "frac_reward_zero_std": 0.0, "grad_norm": 0.18116842210292816, "learning_rate": 2.102381100594577e-07, "loss": 0.0013, "num_tokens": 17343477.0, "reward": 13.875925064086914, "reward_std": 0.38003987073898315, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7886615991592407, "rewards/length2tails_reward/std": 0.25267016887664795, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 274.09375, "completions/mean_terminated_length": 274.09375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.09118715906515718, "epoch": 3.974, "frac_reward_zero_std": 0.0, "grad_norm": 0.1178445890545845, "learning_rate": 2.094521899920403e-07, "loss": 0.0036, "num_tokens": 17352280.0, "reward": 13.685848236083984, "reward_std": 0.5587000846862793, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8821921944618225, "rewards/length2tails_reward/std": 0.18764744699001312, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.50996470451355, "rewards/thermo_reward/std": 0.5615194439888, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 270.15625, "completions/mean_terminated_length": 270.15625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.07772285584360361, "epoch": 3.976, "frac_reward_zero_std": 0.0, "grad_norm": 0.08714106678962708, "learning_rate": 2.0866756978186162e-07, "loss": -0.0016, "num_tokens": 17360957.0, "reward": 13.827531814575195, "reward_std": 0.4348272383213043, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7035890221595764, "rewards/length2tails_reward/std": 0.30343014001846313, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 271.5625, "completions/mean_terminated_length": 271.5625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.07163404440507293, "epoch": 3.9779999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.09531030058860779, "learning_rate": 2.078842507190328e-07, "loss": -0.0063, "num_tokens": 17369679.0, "reward": 13.312138557434082, "reward_std": 2.605790376663208, "rewards/fitness_reward/mean": 7.246166706085205, "rewards/fitness_reward/std": 0.650642991065979, "rewards/kidney_reward/mean": 2.5147488117218018, "rewards/kidney_reward/std": 0.6320452094078064, "rewards/length2tails_reward/mean": 0.6933004856109619, "rewards/length2tails_reward/std": 0.31690841913223267, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.3818931579589844, "rewards/thermo_reward/std": 1.4718438386917114, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08093912713229656, "epoch": 3.98, "frac_reward_zero_std": 0.0, "grad_norm": 0.06563757359981537, "learning_rate": 2.0710223409152471e-07, "loss": -0.0047, "num_tokens": 17378403.0, "reward": 13.421937942504883, "reward_std": 3.006488561630249, "rewards/fitness_reward/mean": 7.053053855895996, "rewards/fitness_reward/std": 1.7430548667907715, "rewards/kidney_reward/mean": 2.5390896797180176, "rewards/kidney_reward/std": 0.49435171484947205, "rewards/length2tails_reward/mean": 0.7401957511901855, "rewards/length2tails_reward/std": 0.3054744601249695, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.6557741165161133, "rewards/thermo_reward/std": 0.7805873155593872, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.0830417824909091, "epoch": 3.982, "frac_reward_zero_std": 0.0, "grad_norm": 0.08848419785499573, "learning_rate": 2.0632152118516778e-07, "loss": -0.0024, "num_tokens": 17387135.0, "reward": 13.952494621276855, "reward_std": 0.23339000344276428, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7566424012184143, "rewards/length2tails_reward/std": 0.23912560939788818, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 271.40625, "completions/mean_terminated_length": 271.40625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.09022182878106833, "epoch": 3.984, "frac_reward_zero_std": 0.0, "grad_norm": 0.13608083128929138, "learning_rate": 2.0554211328364745e-07, "loss": -0.0047, "num_tokens": 17395852.0, "reward": 12.72749137878418, "reward_std": 4.599298000335693, "rewards/fitness_reward/mean": 7.022955417633057, "rewards/fitness_reward/std": 1.9133150577545166, "rewards/kidney_reward/mean": 2.4153106212615967, "rewards/kidney_reward/std": 1.0462912321090698, "rewards/length2tails_reward/mean": 0.784592866897583, "rewards/length2tails_reward/std": 0.2927846610546112, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.1107654571533203, "rewards/thermo_reward/std": 2.040642499923706, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.08878266904503107, "epoch": 3.9859999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.0827256366610527, "learning_rate": 2.0476401166850477e-07, "loss": -0.0049, "num_tokens": 17404608.0, "reward": 13.730037689208984, "reward_std": 1.076958417892456, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7560330629348755, "rewards/length2tails_reward/std": 0.28175225853919983, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5941295623779297, "rewards/thermo_reward/std": 0.918302595615387, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.03125, "completions/mean_terminated_length": 272.03125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08085281401872635, "epoch": 3.988, "frac_reward_zero_std": 0.0, "grad_norm": 0.09590739011764526, "learning_rate": 2.0398721761913207e-07, "loss": 0.0009, "num_tokens": 17413345.0, "reward": 13.873292922973633, "reward_std": 0.3764077425003052, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7623419165611267, "rewards/length2tails_reward/std": 0.26891589164733887, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7093944549560547, "rewards/thermo_reward/std": 0.37798434495925903, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 273.0625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07451770780608058, "epoch": 3.99, "frac_reward_zero_std": 0.0, "grad_norm": 0.11704223603010178, "learning_rate": 2.0321173241277235e-07, "loss": 0.0013, "num_tokens": 17422115.0, "reward": 13.808777809143066, "reward_std": 0.5187250971794128, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7896385192871094, "rewards/length2tails_reward/std": 0.24748274683952332, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.15625, "completions/mean_terminated_length": 272.15625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.07666803011670709, "epoch": 3.992, "frac_reward_zero_std": 0.0, "grad_norm": 0.08989939093589783, "learning_rate": 2.0243755732451562e-07, "loss": -0.0048, "num_tokens": 17430856.0, "reward": 13.739928245544434, "reward_std": 0.8497186899185181, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.530752658843994, "rewards/kidney_reward/std": 0.5415143966674805, "rewards/length2tails_reward/mean": 0.7848218679428101, "rewards/length2tails_reward/std": 0.27701008319854736, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.669508457183838, "rewards/thermo_reward/std": 0.42886754870414734, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.08358711935579777, "epoch": 3.9939999999999998, "frac_reward_zero_std": 0.0, "grad_norm": 0.08841954916715622, "learning_rate": 2.0166469362729865e-07, "loss": 0.001, "num_tokens": 17439612.0, "reward": 13.736370086669922, "reward_std": 1.049423098564148, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.7948298454284668, "rewards/length2tails_reward/std": 0.2536636292934418, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.5692219734191895, "rewards/thermo_reward/std": 1.0518332719802856, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.08054515346884727, "epoch": 3.996, "frac_reward_zero_std": 0.0, "grad_norm": 0.12520883977413177, "learning_rate": 2.008931425919015e-07, "loss": 0.0017, "num_tokens": 17448320.0, "reward": 13.648792266845703, "reward_std": 1.0796316862106323, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7211928367614746, "rewards/length2tails_reward/std": 0.28733447194099426, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.516367197036743, "rewards/thermo_reward/std": 0.9401901960372925, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 272.78125, "completions/mean_terminated_length": 272.78125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08685789862647653, "epoch": 3.998, "frac_reward_zero_std": 0.0, "grad_norm": 0.06330923736095428, "learning_rate": 2.001229054869461e-07, "loss": 0.0, "num_tokens": 17457081.0, "reward": 13.92857837677002, "reward_std": 0.37837153673171997, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.599120616912842, "rewards/kidney_reward/std": 0.15476678311824799, "rewards/length2tails_reward/mean": 0.7910628318786621, "rewards/length2tails_reward/std": 0.21263666450977325, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 273.8125, "completions/mean_terminated_length": 273.8125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.08658518642187119, "epoch": 4.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.08859028667211533, "learning_rate": 1.9935398357889389e-07, "loss": -0.0034, "num_tokens": 17465875.0, "reward": 13.957188606262207, "reward_std": 0.22562836110591888, "rewards/fitness_reward/mean": 7.361185073852539, "rewards/fitness_reward/std": 0.0, "rewards/kidney_reward/mean": 2.6264796257019043, "rewards/kidney_reward/std": 0.0, "rewards/length2tails_reward/mean": 0.8035746216773987, "rewards/length2tails_reward/std": 0.22424465417861938, "rewards/repeated_in_batch_reward/mean": 1.0, "rewards/repeated_in_batch_reward/std": 0.0, "rewards/thermo_reward/mean": 3.7891664505004883, "rewards/thermo_reward/std": 0.225629061460495, "step": 2000 } ], "logging_steps": 1, "max_steps": 2500, "num_input_tokens_seen": 17465875, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }