{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.34142802270496353, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 141.27679443359375, "completions/mean_terminated_length": 133.32432556152344, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0003414280227049635, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0458, "num_tokens": 379814.0, "reward": 0.40348002314567566, "reward_std": 0.06271512806415558, "rewards/gemini_judge_reward_func/mean": 0.109375, "rewards/gemini_judge_reward_func/std": 0.21642543375492096, "rewards/semantic_correctness_reward_func/mean": 0.42229294776916504, "rewards/semantic_correctness_reward_func/std": 0.2194633036851883, "rewards/xmlcount_reward_func/mean": 0.6881785988807678, "rewards/xmlcount_reward_func/std": 0.46016210317611694, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 137.9107208251953, "completions/mean_terminated_length": 137.9107208251953, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.000682856045409927, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0198, "num_tokens": 720774.0, "reward": 0.4278814196586609, "reward_std": 0.056292574852705, "rewards/gemini_judge_reward_func/mean": 0.1160714253783226, "rewards/gemini_judge_reward_func/std": 0.22691282629966736, "rewards/semantic_correctness_reward_func/mean": 0.39234450459480286, "rewards/semantic_correctness_reward_func/std": 0.2054908126592636, "rewards/xmlcount_reward_func/mean": 0.7574599385261536, "rewards/xmlcount_reward_func/std": 0.4265315532684326, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 126.82589721679688, "completions/mean_terminated_length": 126.82589721679688, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0010242840681148906, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0051, "num_tokens": 1091335.0, "reward": 0.41104522347450256, "reward_std": 0.05016090348362923, "rewards/gemini_judge_reward_func/mean": 0.1395089328289032, "rewards/gemini_judge_reward_func/std": 0.25786155462265015, "rewards/semantic_correctness_reward_func/mean": 0.4177080988883972, "rewards/semantic_correctness_reward_func/std": 0.2196023315191269, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 156.24554443359375, "completions/mean_terminated_length": 148.4279327392578, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.001365712090819854, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0358, "num_tokens": 1433778.0, "reward": 0.4309219717979431, "reward_std": 0.07280989736318588, "rewards/gemini_judge_reward_func/mean": 0.1127232164144516, "rewards/gemini_judge_reward_func/std": 0.22798825800418854, "rewards/semantic_correctness_reward_func/mean": 0.4097882807254791, "rewards/semantic_correctness_reward_func/std": 0.2280801683664322, "rewards/xmlcount_reward_func/mean": 0.7596875429153442, "rewards/xmlcount_reward_func/std": 0.4291202127933502, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 151.125, "completions/mean_terminated_length": 151.125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0017071401135248176, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0083, "num_tokens": 1800166.0, "reward": 0.44322600960731506, "reward_std": 0.060560885816812515, "rewards/gemini_judge_reward_func/mean": 0.1316964328289032, "rewards/gemini_judge_reward_func/std": 0.25926730036735535, "rewards/semantic_correctness_reward_func/mean": 0.4244246482849121, "rewards/semantic_correctness_reward_func/std": 0.21751059591770172, "rewards/xmlcount_reward_func/mean": 0.7641563415527344, "rewards/xmlcount_reward_func/std": 0.4263768792152405, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 157.68304443359375, "completions/mean_terminated_length": 145.92308044433594, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.002048568136229781, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0095, "num_tokens": 2137347.0, "reward": 0.4433988034725189, "reward_std": 0.07498325407505035, "rewards/gemini_judge_reward_func/mean": 0.1450892835855484, "rewards/gemini_judge_reward_func/std": 0.27536848187446594, "rewards/semantic_correctness_reward_func/mean": 0.42084214091300964, "rewards/semantic_correctness_reward_func/std": 0.23131638765335083, "rewards/xmlcount_reward_func/mean": 0.7529866099357605, "rewards/xmlcount_reward_func/std": 0.4318158030509949, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 131.30804443359375, "completions/mean_terminated_length": 127.30493927001953, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0023899961589347444, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0101, "num_tokens": 2528988.0, "reward": 0.3721596896648407, "reward_std": 0.06552143394947052, "rewards/gemini_judge_reward_func/mean": 0.1439732164144516, "rewards/gemini_judge_reward_func/std": 0.25538474321365356, "rewards/semantic_correctness_reward_func/mean": 0.4288518726825714, "rewards/semantic_correctness_reward_func/std": 0.22205649316310883, "rewards/xmlcount_reward_func/mean": 0.5720000267028809, "rewards/xmlcount_reward_func/std": 0.4964759945869446, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 132.20089721679688, "completions/mean_terminated_length": 132.20089721679688, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.002731424181639708, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0009, "num_tokens": 2903613.0, "reward": 0.4032416045665741, "reward_std": 0.06853938102722168, "rewards/gemini_judge_reward_func/mean": 0.1294642835855484, "rewards/gemini_judge_reward_func/std": 0.24140411615371704, "rewards/semantic_correctness_reward_func/mean": 0.43676143884658813, "rewards/semantic_correctness_reward_func/std": 0.22384540736675262, "rewards/xmlcount_reward_func/mean": 0.660258948802948, "rewards/xmlcount_reward_func/std": 0.47449371218681335, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 141.95982360839844, "completions/mean_terminated_length": 138.00448608398438, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0030728522043446714, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0041, "num_tokens": 3262084.0, "reward": 0.4193665385246277, "reward_std": 0.06024722009897232, "rewards/gemini_judge_reward_func/mean": 0.0904017835855484, "rewards/gemini_judge_reward_func/std": 0.2089034467935562, "rewards/semantic_correctness_reward_func/mean": 0.4145289957523346, "rewards/semantic_correctness_reward_func/std": 0.1940658688545227, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 151.4866180419922, "completions/mean_terminated_length": 135.6227264404297, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.003414280227049635, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0118, "num_tokens": 3636097.0, "reward": 0.3932708203792572, "reward_std": 0.0754866749048233, "rewards/gemini_judge_reward_func/mean": 0.15625, "rewards/gemini_judge_reward_func/std": 0.2853386700153351, "rewards/semantic_correctness_reward_func/mean": 0.43835392594337463, "rewards/semantic_correctness_reward_func/std": 0.2281491756439209, "rewards/xmlcount_reward_func/mean": 0.6077500581741333, "rewards/xmlcount_reward_func/std": 0.48996880650520325, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 130.58482360839844, "completions/mean_terminated_length": 130.58482360839844, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0037557082497545985, "grad_norm": 0.039375144988298416, "kl": 0.0, "learning_rate": 2.0000000000000002e-07, "loss": -0.0231, "num_tokens": 3973920.0, "reward": 0.4216010272502899, "reward_std": 0.07699327915906906, "rewards/gemini_judge_reward_func/mean": 0.1171875, "rewards/gemini_judge_reward_func/std": 0.26352497935295105, "rewards/semantic_correctness_reward_func/mean": 0.4078800678253174, "rewards/semantic_correctness_reward_func/std": 0.21701829135417938, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 163.13839721679688, "completions/mean_terminated_length": 143.4840087890625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.004097136272459562, "grad_norm": 0.027680950239300728, "kl": 0.0, "learning_rate": 4.0000000000000003e-07, "loss": -0.0192, "num_tokens": 4325419.0, "reward": 0.38460925221443176, "reward_std": 0.05896308273077011, "rewards/gemini_judge_reward_func/mean": 0.0959821417927742, "rewards/gemini_judge_reward_func/std": 0.2214544713497162, "rewards/semantic_correctness_reward_func/mean": 0.37258180975914, "rewards/semantic_correctness_reward_func/std": 0.18282517790794373, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853896975517273, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 151.97769165039062, "completions/mean_terminated_length": 151.97769165039062, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0044385642951645255, "grad_norm": 0.02707161009311676, "kl": 1.8555670976638794e-05, "learning_rate": 6.000000000000001e-07, "loss": -0.0014, "num_tokens": 4651218.0, "reward": 0.4828442335128784, "reward_std": 0.07185830920934677, "rewards/gemini_judge_reward_func/mean": 0.1149553582072258, "rewards/gemini_judge_reward_func/std": 0.22932343184947968, "rewards/semantic_correctness_reward_func/mean": 0.43256044387817383, "rewards/semantic_correctness_reward_func/std": 0.20364603400230408, "rewards/xmlcount_reward_func/mean": 0.8758750557899475, "rewards/xmlcount_reward_func/std": 0.33179107308387756, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 152.73214721679688, "completions/mean_terminated_length": 144.8828887939453, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.004779992317869489, "grad_norm": 0.0345768965780735, "kl": 1.5214085578918457e-05, "learning_rate": 8.000000000000001e-07, "loss": -0.015, "num_tokens": 4993970.0, "reward": 0.42273515462875366, "reward_std": 0.06252222508192062, "rewards/gemini_judge_reward_func/mean": 0.09375, "rewards/gemini_judge_reward_func/std": 0.1822412759065628, "rewards/semantic_correctness_reward_func/mean": 0.4202113747596741, "rewards/semantic_correctness_reward_func/std": 0.2077675461769104, "rewards/xmlcount_reward_func/mean": 0.7529821395874023, "rewards/xmlcount_reward_func/std": 0.43116891384124756, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 134.96429443359375, "completions/mean_terminated_length": 130.9775848388672, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.005121420340574453, "grad_norm": 0.027755815535783768, "kl": 1.3284385204315186e-05, "learning_rate": 1.0000000000000002e-06, "loss": -0.0036, "num_tokens": 5346350.0, "reward": 0.4308743476867676, "reward_std": 0.06485090404748917, "rewards/gemini_judge_reward_func/mean": 0.1261160671710968, "rewards/gemini_judge_reward_func/std": 0.22584888339042664, "rewards/semantic_correctness_reward_func/mean": 0.4319072663784027, "rewards/semantic_correctness_reward_func/std": 0.19891038537025452, "rewards/xmlcount_reward_func/mean": 0.7351161241531372, "rewards/xmlcount_reward_func/std": 0.44118446111679077, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 150.21429443359375, "completions/mean_terminated_length": 146.2959747314453, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.005462848363279416, "grad_norm": 0.027755815535783768, "kl": 1.9583851099014282e-05, "learning_rate": 1.0000000000000002e-06, "loss": -0.0088, "num_tokens": 5673342.0, "reward": 0.4351733922958374, "reward_std": 0.06109142303466797, "rewards/gemini_judge_reward_func/mean": 0.1183035746216774, "rewards/gemini_judge_reward_func/std": 0.21817582845687866, "rewards/semantic_correctness_reward_func/mean": 0.437759667634964, "rewards/semantic_correctness_reward_func/std": 0.19619369506835938, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 144.22769165039062, "completions/mean_terminated_length": 136.3018035888672, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.00580427638598438, "grad_norm": 0.035458628088235855, "kl": 1.4953315258026123e-05, "learning_rate": 1.2000000000000002e-06, "loss": -0.0095, "num_tokens": 6053241.0, "reward": 0.4043574631214142, "reward_std": 0.07264947146177292, "rewards/gemini_judge_reward_func/mean": 0.1328125, "rewards/gemini_judge_reward_func/std": 0.24477799236774445, "rewards/semantic_correctness_reward_func/mean": 0.41257286071777344, "rewards/semantic_correctness_reward_func/std": 0.22099269926548004, "rewards/xmlcount_reward_func/mean": 0.671794593334198, "rewards/xmlcount_reward_func/std": 0.46925294399261475, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 145.47769165039062, "completions/mean_terminated_length": 141.53811645507812, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.006145704408689343, "grad_norm": 0.03304464370012283, "kl": 1.3154000043869019e-05, "learning_rate": 1.4000000000000001e-06, "loss": -0.0373, "num_tokens": 6402224.0, "reward": 0.4680355191230774, "reward_std": 0.06361004710197449, "rewards/gemini_judge_reward_func/mean": 0.1517857164144516, "rewards/gemini_judge_reward_func/std": 0.29065632820129395, "rewards/semantic_correctness_reward_func/mean": 0.4278559684753418, "rewards/semantic_correctness_reward_func/std": 0.2081516683101654, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 147.27679443359375, "completions/mean_terminated_length": 143.3452911376953, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.006487132431394307, "grad_norm": 0.031187007203698158, "kl": 1.9073486328125e-05, "learning_rate": 1.6000000000000001e-06, "loss": -0.0048, "num_tokens": 6735918.0, "reward": 0.4452937841415405, "reward_std": 0.07176318019628525, "rewards/gemini_judge_reward_func/mean": 0.1116071417927742, "rewards/gemini_judge_reward_func/std": 0.22669215500354767, "rewards/semantic_correctness_reward_func/mean": 0.38559380173683167, "rewards/semantic_correctness_reward_func/std": 0.2231680005788803, "rewards/xmlcount_reward_func/mean": 0.8088303804397583, "rewards/xmlcount_reward_func/std": 0.3893822133541107, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 157.04019165039062, "completions/mean_terminated_length": 141.2772674560547, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.00682856045409927, "grad_norm": 0.0283343568444252, "kl": 1.8853694200515747e-05, "learning_rate": 1.8000000000000001e-06, "loss": -0.0294, "num_tokens": 7075515.0, "reward": 0.413688063621521, "reward_std": 0.05096305534243584, "rewards/gemini_judge_reward_func/mean": 0.0814732164144516, "rewards/gemini_judge_reward_func/std": 0.19466669857501984, "rewards/semantic_correctness_reward_func/mean": 0.4039938151836395, "rewards/semantic_correctness_reward_func/std": 0.18301716446876526, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 141.3303680419922, "completions/mean_terminated_length": 133.37838745117188, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.007169988476804234, "grad_norm": 0.030031763017177582, "kl": 3.3717602491378784e-05, "learning_rate": 2.0000000000000003e-06, "loss": -0.0204, "num_tokens": 7425837.0, "reward": 0.38512933254241943, "reward_std": 0.07402481883764267, "rewards/gemini_judge_reward_func/mean": 0.1216517835855484, "rewards/gemini_judge_reward_func/std": 0.2437330186367035, "rewards/semantic_correctness_reward_func/mean": 0.36952146887779236, "rewards/semantic_correctness_reward_func/std": 0.2271348237991333, "rewards/xmlcount_reward_func/mean": 0.6564107537269592, "rewards/xmlcount_reward_func/std": 0.5066681504249573, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 142.75894165039062, "completions/mean_terminated_length": 138.8071746826172, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.007511416499509197, "grad_norm": 0.028901347890496254, "kl": 2.1755695343017578e-05, "learning_rate": 2.2e-06, "loss": 0.0354, "num_tokens": 7767839.0, "reward": 0.42108339071273804, "reward_std": 0.06260724365711212, "rewards/gemini_judge_reward_func/mean": 0.1082589253783226, "rewards/gemini_judge_reward_func/std": 0.2214629352092743, "rewards/semantic_correctness_reward_func/mean": 0.41428306698799133, "rewards/semantic_correctness_reward_func/std": 0.20763908326625824, "rewards/xmlcount_reward_func/mean": 0.7373080849647522, "rewards/xmlcount_reward_func/std": 0.43889865279197693, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 150.8303680419922, "completions/mean_terminated_length": 142.96397399902344, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.00785284452221416, "grad_norm": 0.026742972433567047, "kl": 2.8714537620544434e-05, "learning_rate": 2.4000000000000003e-06, "loss": -0.0139, "num_tokens": 8112905.0, "reward": 0.43243101239204407, "reward_std": 0.05913807824254036, "rewards/gemini_judge_reward_func/mean": 0.0870535746216774, "rewards/gemini_judge_reward_func/std": 0.17304261028766632, "rewards/semantic_correctness_reward_func/mean": 0.42398518323898315, "rewards/semantic_correctness_reward_func/std": 0.1769956350326538, "rewards/xmlcount_reward_func/mean": 0.7820313572883606, "rewards/xmlcount_reward_func/std": 0.41473883390426636, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 154.7232208251953, "completions/mean_terminated_length": 146.8918914794922, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.008194272544919124, "grad_norm": 0.03460094705224037, "kl": 7.3261559009552e-05, "learning_rate": 2.6e-06, "loss": -0.006, "num_tokens": 8467131.0, "reward": 0.3985432982444763, "reward_std": 0.05673561245203018, "rewards/gemini_judge_reward_func/mean": 0.1015625, "rewards/gemini_judge_reward_func/std": 0.21178245544433594, "rewards/semantic_correctness_reward_func/mean": 0.39980557560920715, "rewards/semantic_correctness_reward_func/std": 0.2231663316488266, "rewards/xmlcount_reward_func/mean": 0.6948928833007812, "rewards/xmlcount_reward_func/std": 0.4610230326652527, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 146.30357360839844, "completions/mean_terminated_length": 138.39639282226562, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.008535700567624089, "grad_norm": 0.03418440371751785, "kl": 0.00016423314809799194, "learning_rate": 2.8000000000000003e-06, "loss": -0.0237, "num_tokens": 8838515.0, "reward": 0.39606812596321106, "reward_std": 0.07865350693464279, "rewards/gemini_judge_reward_func/mean": 0.1551339328289032, "rewards/gemini_judge_reward_func/std": 0.2912905812263489, "rewards/semantic_correctness_reward_func/mean": 0.39453673362731934, "rewards/semantic_correctness_reward_func/std": 0.22986909747123718, "rewards/xmlcount_reward_func/mean": 0.6377679109573364, "rewards/xmlcount_reward_func/std": 0.48144102096557617, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 155.25, "completions/mean_terminated_length": 155.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.008877128590329051, "grad_norm": 0.025704992935061455, "kl": 0.00011079013347625732, "learning_rate": 3e-06, "loss": -0.011, "num_tokens": 9173275.0, "reward": 0.45401424169540405, "reward_std": 0.06545478105545044, "rewards/gemini_judge_reward_func/mean": 0.1071428582072258, "rewards/gemini_judge_reward_func/std": 0.2276175171136856, "rewards/semantic_correctness_reward_func/mean": 0.4121246933937073, "rewards/semantic_correctness_reward_func/std": 0.19307947158813477, "rewards/xmlcount_reward_func/mean": 0.8218303918838501, "rewards/xmlcount_reward_func/std": 0.3807325065135956, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 157.77679443359375, "completions/mean_terminated_length": 142.0272674560547, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.009218556613034015, "grad_norm": 0.03233994543552399, "kl": 0.00011872127652168274, "learning_rate": 3.2000000000000003e-06, "loss": -0.0147, "num_tokens": 9514313.0, "reward": 0.41901880502700806, "reward_std": 0.05425465106964111, "rewards/gemini_judge_reward_func/mean": 0.1004464253783226, "rewards/gemini_judge_reward_func/std": 0.2269900143146515, "rewards/semantic_correctness_reward_func/mean": 0.3894062638282776, "rewards/semantic_correctness_reward_func/std": 0.21159562468528748, "rewards/xmlcount_reward_func/mean": 0.7523974180221558, "rewards/xmlcount_reward_func/std": 0.43225109577178955, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 143.82589721679688, "completions/mean_terminated_length": 139.87893676757812, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.009559984635738978, "grad_norm": 0.030443880707025528, "kl": 0.00025102123618125916, "learning_rate": 3.4000000000000005e-06, "loss": 0.0046, "num_tokens": 9857054.0, "reward": 0.4207466244697571, "reward_std": 0.04802559316158295, "rewards/gemini_judge_reward_func/mean": 0.0792410746216774, "rewards/gemini_judge_reward_func/std": 0.16614827513694763, "rewards/semantic_correctness_reward_func/mean": 0.37225088477134705, "rewards/semantic_correctness_reward_func/std": 0.17117980122566223, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 128.8794708251953, "completions/mean_terminated_length": 128.8794708251953, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.009901412658443942, "grad_norm": 0.039184801280498505, "kl": 0.0005348548293113708, "learning_rate": 3.6000000000000003e-06, "loss": -0.0665, "num_tokens": 10256135.0, "reward": 0.3793807625770569, "reward_std": 0.05964759737253189, "rewards/gemini_judge_reward_func/mean": 0.1339285671710968, "rewards/gemini_judge_reward_func/std": 0.2697960138320923, "rewards/semantic_correctness_reward_func/mean": 0.41354653239250183, "rewards/semantic_correctness_reward_func/std": 0.2386574149131775, "rewards/xmlcount_reward_func/mean": 0.6077500581741333, "rewards/xmlcount_reward_func/std": 0.48996880650520325, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 147.1607208251953, "completions/mean_terminated_length": 143.2287139892578, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.010242840681148906, "grad_norm": 0.026800105348229408, "kl": 0.00027988851070404053, "learning_rate": 3.8000000000000005e-06, "loss": -0.0072, "num_tokens": 10580663.0, "reward": 0.45957663655281067, "reward_std": 0.06587394326925278, "rewards/gemini_judge_reward_func/mean": 0.1026785746216774, "rewards/gemini_judge_reward_func/std": 0.20789778232574463, "rewards/semantic_correctness_reward_func/mean": 0.41227564215660095, "rewards/semantic_correctness_reward_func/std": 0.19964328408241272, "rewards/xmlcount_reward_func/mean": 0.8401250839233398, "rewards/xmlcount_reward_func/std": 0.3684578835964203, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 137.16519165039062, "completions/mean_terminated_length": 133.1883544921875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.010584268703853868, "grad_norm": 0.028585907071828842, "kl": 0.0008361563086509705, "learning_rate": 4.000000000000001e-06, "loss": -0.0323, "num_tokens": 10931200.0, "reward": 0.40794217586517334, "reward_std": 0.060390252619981766, "rewards/gemini_judge_reward_func/mean": 0.1194196417927742, "rewards/gemini_judge_reward_func/std": 0.2207612693309784, "rewards/semantic_correctness_reward_func/mean": 0.44237130880355835, "rewards/semantic_correctness_reward_func/std": 0.18124501407146454, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 139.8303680419922, "completions/mean_terminated_length": 135.865478515625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.010925696726558833, "grad_norm": 0.029551656916737556, "kl": 0.0011077597737312317, "learning_rate": 4.2000000000000004e-06, "loss": -0.0086, "num_tokens": 11283158.0, "reward": 0.3982951045036316, "reward_std": 0.051123134791851044, "rewards/gemini_judge_reward_func/mean": 0.1116071417927742, "rewards/gemini_judge_reward_func/std": 0.2375587821006775, "rewards/semantic_correctness_reward_func/mean": 0.4276362359523773, "rewards/semantic_correctness_reward_func/std": 0.23077288269996643, "rewards/xmlcount_reward_func/mean": 0.6703125238418579, "rewards/xmlcount_reward_func/std": 0.4718664884567261, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 132.13839721679688, "completions/mean_terminated_length": 132.13839721679688, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.011267124749263795, "grad_norm": 0.02802436798810959, "kl": 0.001424439251422882, "learning_rate": 4.4e-06, "loss": -0.0199, "num_tokens": 11632649.0, "reward": 0.4111942648887634, "reward_std": 0.05410204827785492, "rewards/gemini_judge_reward_func/mean": 0.0959821417927742, "rewards/gemini_judge_reward_func/std": 0.20018360018730164, "rewards/semantic_correctness_reward_func/mean": 0.3969176113605499, "rewards/semantic_correctness_reward_func/std": 0.1971798837184906, "rewards/xmlcount_reward_func/mean": 0.733544647693634, "rewards/xmlcount_reward_func/std": 0.44044601917266846, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 149.41519165039062, "completions/mean_terminated_length": 145.4932861328125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.01160855277196876, "grad_norm": 0.02668035216629505, "kl": 0.0008018910884857178, "learning_rate": 4.600000000000001e-06, "loss": -0.0001, "num_tokens": 11953490.0, "reward": 0.4242376387119293, "reward_std": 0.05728016048669815, "rewards/gemini_judge_reward_func/mean": 0.0982142835855484, "rewards/gemini_judge_reward_func/std": 0.20601151883602142, "rewards/semantic_correctness_reward_func/mean": 0.38750943541526794, "rewards/semantic_correctness_reward_func/std": 0.18285952508449554, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 139.9819793701172, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.011949980794673723, "grad_norm": 0.029554614797234535, "kl": 0.0027062706649303436, "learning_rate": 4.800000000000001e-06, "loss": -0.0146, "num_tokens": 12302866.0, "reward": 0.47618868947029114, "reward_std": 0.05818319693207741, "rewards/gemini_judge_reward_func/mean": 0.1595982164144516, "rewards/gemini_judge_reward_func/std": 0.29934054613113403, "rewards/semantic_correctness_reward_func/mean": 0.41724681854248047, "rewards/semantic_correctness_reward_func/std": 0.2225155085325241, "rewards/xmlcount_reward_func/mean": 0.8222500085830688, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 146.45089721679688, "completions/mean_terminated_length": 134.53846740722656, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.012291408817378686, "grad_norm": 0.028692543506622314, "kl": 0.0030185282230377197, "learning_rate": 5e-06, "loss": -0.0039, "num_tokens": 12669655.0, "reward": 0.3967846930027008, "reward_std": 0.05047953501343727, "rewards/gemini_judge_reward_func/mean": 0.1082589253783226, "rewards/gemini_judge_reward_func/std": 0.23257403075695038, "rewards/semantic_correctness_reward_func/mean": 0.4089055061340332, "rewards/semantic_correctness_reward_func/std": 0.21490508317947388, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 149.4241180419922, "completions/mean_terminated_length": 137.5520477294922, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.01263283684008365, "grad_norm": 0.030691703781485558, "kl": 0.002063453197479248, "learning_rate": 5.2e-06, "loss": -0.0337, "num_tokens": 13029822.0, "reward": 0.42319977283477783, "reward_std": 0.06844579428434372, "rewards/gemini_judge_reward_func/mean": 0.1060267835855484, "rewards/gemini_judge_reward_func/std": 0.23599198460578918, "rewards/semantic_correctness_reward_func/mean": 0.4158558249473572, "rewards/semantic_correctness_reward_func/std": 0.20988070964813232, "rewards/xmlcount_reward_func/mean": 0.7440447211265564, "rewards/xmlcount_reward_func/std": 0.43694427609443665, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 159.00894165039062, "completions/mean_terminated_length": 147.2669677734375, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.012974264862788614, "grad_norm": 0.027180153876543045, "kl": 0.002780407667160034, "learning_rate": 5.400000000000001e-06, "loss": -0.0143, "num_tokens": 13386520.0, "reward": 0.4280446171760559, "reward_std": 0.07069174945354462, "rewards/gemini_judge_reward_func/mean": 0.1428571492433548, "rewards/gemini_judge_reward_func/std": 0.28156015276908875, "rewards/semantic_correctness_reward_func/mean": 0.4245087206363678, "rewards/semantic_correctness_reward_func/std": 0.21585093438625336, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 152.5803680419922, "completions/mean_terminated_length": 144.729736328125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.013315692885493577, "grad_norm": 0.02773498371243477, "kl": 0.00479482114315033, "learning_rate": 5.600000000000001e-06, "loss": -0.0215, "num_tokens": 13759430.0, "reward": 0.37968793511390686, "reward_std": 0.055167291313409805, "rewards/gemini_judge_reward_func/mean": 0.0904017835855484, "rewards/gemini_judge_reward_func/std": 0.1978796124458313, "rewards/semantic_correctness_reward_func/mean": 0.3948860466480255, "rewards/semantic_correctness_reward_func/std": 0.18360565602779388, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 147.25, "completions/mean_terminated_length": 135.34841918945312, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.01365712090819854, "grad_norm": 0.025785459205508232, "kl": 0.006071865558624268, "learning_rate": 5.8e-06, "loss": -0.0544, "num_tokens": 14144086.0, "reward": 0.3708299696445465, "reward_std": 0.06480063498020172, "rewards/gemini_judge_reward_func/mean": 0.0915178582072258, "rewards/gemini_judge_reward_func/std": 0.21174997091293335, "rewards/semantic_correctness_reward_func/mean": 0.4064801037311554, "rewards/semantic_correctness_reward_func/std": 0.1985797882080078, "rewards/xmlcount_reward_func/mean": 0.6323170065879822, "rewards/xmlcount_reward_func/std": 0.48041653633117676, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 146.66964721679688, "completions/mean_terminated_length": 138.76576232910156, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.013998548930903503, "grad_norm": 0.025612158700823784, "kl": 0.0058727264404296875, "learning_rate": 6e-06, "loss": -0.0175, "num_tokens": 14512272.0, "reward": 0.39734622836112976, "reward_std": 0.0646364837884903, "rewards/gemini_judge_reward_func/mean": 0.1305803507566452, "rewards/gemini_judge_reward_func/std": 0.24711813032627106, "rewards/semantic_correctness_reward_func/mean": 0.43632930517196655, "rewards/semantic_correctness_reward_func/std": 0.19070696830749512, "rewards/xmlcount_reward_func/mean": 0.6446205973625183, "rewards/xmlcount_reward_func/std": 0.479495108127594, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 137.9866180419922, "completions/mean_terminated_length": 130.00450134277344, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.014339976953608467, "grad_norm": 0.029739174991846085, "kl": 0.007889151573181152, "learning_rate": 6.200000000000001e-06, "loss": -0.0672, "num_tokens": 14863201.0, "reward": 0.3775990903377533, "reward_std": 0.05630933493375778, "rewards/gemini_judge_reward_func/mean": 0.0814732164144516, "rewards/gemini_judge_reward_func/std": 0.20312152802944183, "rewards/semantic_correctness_reward_func/mean": 0.3665488064289093, "rewards/semantic_correctness_reward_func/std": 0.19233566522598267, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 143.08929443359375, "completions/mean_terminated_length": 143.08929443359375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.014681404976313431, "grad_norm": 0.02772599086165428, "kl": 0.004380345344543457, "learning_rate": 6.4000000000000006e-06, "loss": -0.0156, "num_tokens": 15217021.0, "reward": 0.41600289940834045, "reward_std": 0.06771310418844223, "rewards/gemini_judge_reward_func/mean": 0.1149553582072258, "rewards/gemini_judge_reward_func/std": 0.21152234077453613, "rewards/semantic_correctness_reward_func/mean": 0.42010369896888733, "rewards/semantic_correctness_reward_func/std": 0.18937553465366364, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 134.53125, "completions/mean_terminated_length": 130.5426025390625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.015022832999018394, "grad_norm": 0.025546282529830933, "kl": 0.0051773786544799805, "learning_rate": 6.600000000000001e-06, "loss": 0.0141, "num_tokens": 15581232.0, "reward": 0.3940742611885071, "reward_std": 0.06104440987110138, "rewards/gemini_judge_reward_func/mean": 0.125, "rewards/gemini_judge_reward_func/std": 0.2593541443347931, "rewards/semantic_correctness_reward_func/mean": 0.45124611258506775, "rewards/semantic_correctness_reward_func/std": 0.2126028686761856, "rewards/xmlcount_reward_func/mean": 0.6345625519752502, "rewards/xmlcount_reward_func/std": 0.48329102993011475, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 140.85714721679688, "completions/mean_terminated_length": 128.86878967285156, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.015364261021723358, "grad_norm": 0.027798650786280632, "kl": 0.006904497742652893, "learning_rate": 6.800000000000001e-06, "loss": 0.0193, "num_tokens": 15928136.0, "reward": 0.40762490034103394, "reward_std": 0.06153449788689613, "rewards/gemini_judge_reward_func/mean": 0.1171875, "rewards/gemini_judge_reward_func/std": 0.21424943208694458, "rewards/semantic_correctness_reward_func/mean": 0.4541868567466736, "rewards/semantic_correctness_reward_func/std": 0.20098300278186798, "rewards/xmlcount_reward_func/mean": 0.6747812628746033, "rewards/xmlcount_reward_func/std": 0.4702269732952118, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 148.0803680419922, "completions/mean_terminated_length": 132.154541015625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.01570568904442832, "grad_norm": 0.028658276423811913, "kl": 0.006076395511627197, "learning_rate": 7e-06, "loss": 0.016, "num_tokens": 16311014.0, "reward": 0.39668479561805725, "reward_std": 0.063643679022789, "rewards/gemini_judge_reward_func/mean": 0.1138392835855484, "rewards/gemini_judge_reward_func/std": 0.23170354962348938, "rewards/semantic_correctness_reward_func/mean": 0.4329952597618103, "rewards/semantic_correctness_reward_func/std": 0.2143063247203827, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 143.875, "completions/mean_terminated_length": 139.92825317382812, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.016047117067133285, "grad_norm": 0.027143213897943497, "kl": 0.0036936402320861816, "learning_rate": 7.2000000000000005e-06, "loss": -0.0194, "num_tokens": 16672714.0, "reward": 0.4406120777130127, "reward_std": 0.0816921517252922, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.24939869344234467, "rewards/semantic_correctness_reward_func/mean": 0.44264957308769226, "rewards/semantic_correctness_reward_func/std": 0.22831664979457855, "rewards/xmlcount_reward_func/mean": 0.7596697211265564, "rewards/xmlcount_reward_func/std": 0.42911025881767273, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 133.13394165039062, "completions/mean_terminated_length": 125.10810852050781, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.01638854508983825, "grad_norm": 0.02828553505241871, "kl": 0.011866092681884766, "learning_rate": 7.4e-06, "loss": -0.0569, "num_tokens": 17044560.0, "reward": 0.3188920021057129, "reward_std": 0.03988515958189964, "rewards/gemini_judge_reward_func/mean": 0.0535714291036129, "rewards/gemini_judge_reward_func/std": 0.1862076371908188, "rewards/semantic_correctness_reward_func/mean": 0.3433171808719635, "rewards/semantic_correctness_reward_func/std": 0.18960954248905182, "rewards/xmlcount_reward_func/mean": 0.5720000267028809, "rewards/xmlcount_reward_func/std": 0.4964759945869446, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 143.87054443359375, "completions/mean_terminated_length": 131.92308044433594, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.016729973112543213, "grad_norm": 0.028787607327103615, "kl": 0.010211586952209473, "learning_rate": 7.600000000000001e-06, "loss": -0.0146, "num_tokens": 17410671.0, "reward": 0.38920655846595764, "reward_std": 0.07314638048410416, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.2295006811618805, "rewards/semantic_correctness_reward_func/mean": 0.39355048537254333, "rewards/semantic_correctness_reward_func/std": 0.2066669762134552, "rewards/xmlcount_reward_func/mean": 0.6557053923606873, "rewards/xmlcount_reward_func/std": 0.510606586933136, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 140.32144165039062, "completions/mean_terminated_length": 140.32144165039062, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.017071401135248177, "grad_norm": 0.026758279651403427, "kl": 0.0061858296394348145, "learning_rate": 7.800000000000002e-06, "loss": -0.0145, "num_tokens": 17774383.0, "reward": 0.4114672839641571, "reward_std": 0.06060100719332695, "rewards/gemini_judge_reward_func/mean": 0.0904017835855484, "rewards/gemini_judge_reward_func/std": 0.20346620678901672, "rewards/semantic_correctness_reward_func/mean": 0.41078296303749084, "rewards/semantic_correctness_reward_func/std": 0.2236628234386444, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 162.0803680419922, "completions/mean_terminated_length": 154.31532287597656, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.017412829157953138, "grad_norm": 0.02499496378004551, "kl": 0.0020468831062316895, "learning_rate": 8.000000000000001e-06, "loss": 0.0036, "num_tokens": 18123241.0, "reward": 0.4477907717227936, "reward_std": 0.06217062473297119, "rewards/gemini_judge_reward_func/mean": 0.1037946417927742, "rewards/gemini_judge_reward_func/std": 0.22855830192565918, "rewards/semantic_correctness_reward_func/mean": 0.4226144850254059, "rewards/semantic_correctness_reward_func/std": 0.2091536521911621, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 145.8169708251953, "completions/mean_terminated_length": 145.8169708251953, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.017754257180658102, "grad_norm": 0.025865867733955383, "kl": 0.00410914421081543, "learning_rate": 8.2e-06, "loss": -0.0136, "num_tokens": 18445812.0, "reward": 0.4681364595890045, "reward_std": 0.07395092397928238, "rewards/gemini_judge_reward_func/mean": 0.1305803507566452, "rewards/gemini_judge_reward_func/std": 0.26884591579437256, "rewards/semantic_correctness_reward_func/mean": 0.4350215494632721, "rewards/semantic_correctness_reward_func/std": 0.2260739952325821, "rewards/xmlcount_reward_func/mean": 0.8222500085830688, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 154.15179443359375, "completions/mean_terminated_length": 138.33636474609375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.018095685203363066, "grad_norm": 0.026150401681661606, "kl": 0.005943477153778076, "learning_rate": 8.400000000000001e-06, "loss": 0.0209, "num_tokens": 18803846.0, "reward": 0.45366746187210083, "reward_std": 0.07954549789428711, "rewards/gemini_judge_reward_func/mean": 0.1729910671710968, "rewards/gemini_judge_reward_func/std": 0.3112905025482178, "rewards/semantic_correctness_reward_func/mean": 0.4566049575805664, "rewards/semantic_correctness_reward_func/std": 0.2221759408712387, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 139.625, "completions/mean_terminated_length": 135.65919494628906, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.01843711322606803, "grad_norm": 0.024718625470995903, "kl": 0.006244301795959473, "learning_rate": 8.6e-06, "loss": 0.0007, "num_tokens": 19149074.0, "reward": 0.44792598485946655, "reward_std": 0.06949326395988464, "rewards/gemini_judge_reward_func/mean": 0.1495535671710968, "rewards/gemini_judge_reward_func/std": 0.25926730036735535, "rewards/semantic_correctness_reward_func/mean": 0.44349589943885803, "rewards/semantic_correctness_reward_func/std": 0.20944607257843018, "rewards/xmlcount_reward_func/mean": 0.748513400554657, "rewards/xmlcount_reward_func/std": 0.43441200256347656, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 165.9241180419922, "completions/mean_terminated_length": 146.3333282470703, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.018778541248772995, "grad_norm": 0.024257266893982887, "kl": 0.009581208229064941, "learning_rate": 8.8e-06, "loss": -0.0089, "num_tokens": 19531933.0, "reward": 0.403461754322052, "reward_std": 0.06676840037107468, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.27608317136764526, "rewards/semantic_correctness_reward_func/mean": 0.4199782907962799, "rewards/semantic_correctness_reward_func/std": 0.25535663962364197, "rewards/xmlcount_reward_func/mean": 0.6703169941902161, "rewards/xmlcount_reward_func/std": 0.46707943081855774, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 162.83929443359375, "completions/mean_terminated_length": 155.08108520507812, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.019119969271477955, "grad_norm": 0.024257266893982887, "kl": 0.006259918212890625, "learning_rate": 8.8e-06, "loss": -0.0184, "num_tokens": 19886317.0, "reward": 0.4279636740684509, "reward_std": 0.06266574561595917, "rewards/gemini_judge_reward_func/mean": 0.0926339253783226, "rewards/gemini_judge_reward_func/std": 0.19683989882469177, "rewards/semantic_correctness_reward_func/mean": 0.4396754205226898, "rewards/semantic_correctness_reward_func/std": 0.2028336226940155, "rewards/xmlcount_reward_func/mean": 0.7574375867843628, "rewards/xmlcount_reward_func/std": 0.42914968729019165, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 137.7053680419922, "completions/mean_terminated_length": 133.73094177246094, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.01946139729418292, "grad_norm": 0.027076715603470802, "kl": 0.01098167896270752, "learning_rate": 9e-06, "loss": 0.0159, "num_tokens": 20248451.0, "reward": 0.3951142132282257, "reward_std": 0.06582393497228622, "rewards/gemini_judge_reward_func/mean": 0.1037946417927742, "rewards/gemini_judge_reward_func/std": 0.23698663711547852, "rewards/semantic_correctness_reward_func/mean": 0.4094816744327545, "rewards/semantic_correctness_reward_func/std": 0.21601319313049316, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 144.91964721679688, "completions/mean_terminated_length": 144.91964721679688, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.019802825316887884, "grad_norm": 0.024703042581677437, "kl": 0.00683748722076416, "learning_rate": 9.200000000000002e-06, "loss": -0.0068, "num_tokens": 20557377.0, "reward": 0.444562703371048, "reward_std": 0.06522774696350098, "rewards/gemini_judge_reward_func/mean": 0.1183035746216774, "rewards/gemini_judge_reward_func/std": 0.20355534553527832, "rewards/semantic_correctness_reward_func/mean": 0.4221436679363251, "rewards/semantic_correctness_reward_func/std": 0.20329251885414124, "rewards/xmlcount_reward_func/mean": 0.7820313572883606, "rewards/xmlcount_reward_func/std": 0.41473886370658875, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 151.1919708251953, "completions/mean_terminated_length": 139.34390258789062, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.020144253339592848, "grad_norm": 0.028883758932352066, "kl": 0.010497450828552246, "learning_rate": 9.4e-06, "loss": -0.0058, "num_tokens": 20920732.0, "reward": 0.44578060507774353, "reward_std": 0.07391282916069031, "rewards/gemini_judge_reward_func/mean": 0.1238839253783226, "rewards/gemini_judge_reward_func/std": 0.2527846693992615, "rewards/semantic_correctness_reward_func/mean": 0.4438849985599518, "rewards/semantic_correctness_reward_func/std": 0.2132381945848465, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 148.88839721679688, "completions/mean_terminated_length": 144.96412658691406, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.020485681362297812, "grad_norm": 0.026241201907396317, "kl": 0.00708240270614624, "learning_rate": 9.600000000000001e-06, "loss": -0.016, "num_tokens": 21251643.0, "reward": 0.433006227016449, "reward_std": 0.06476236879825592, "rewards/gemini_judge_reward_func/mean": 0.1261160671710968, "rewards/gemini_judge_reward_func/std": 0.22584888339042664, "rewards/semantic_correctness_reward_func/mean": 0.4112989008426666, "rewards/semantic_correctness_reward_func/std": 0.23198209702968597, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 163.16519165039062, "completions/mean_terminated_length": 143.51141357421875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.020827109385002773, "grad_norm": 0.025193244218826294, "kl": 0.007884740829467773, "learning_rate": 9.800000000000001e-06, "loss": -0.0213, "num_tokens": 21602088.0, "reward": 0.45057666301727295, "reward_std": 0.0753309428691864, "rewards/gemini_judge_reward_func/mean": 0.1183035746216774, "rewards/gemini_judge_reward_func/std": 0.23784302175045013, "rewards/semantic_correctness_reward_func/mean": 0.4097670912742615, "rewards/semantic_correctness_reward_func/std": 0.2002406269311905, "rewards/xmlcount_reward_func/mean": 0.8032545447349548, "rewards/xmlcount_reward_func/std": 0.3983818590641022, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 143.79464721679688, "completions/mean_terminated_length": 139.8475341796875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.021168537407707737, "grad_norm": 0.02755032107234001, "kl": 0.006412327289581299, "learning_rate": 1e-05, "loss": -0.0059, "num_tokens": 21971558.0, "reward": 0.44730687141418457, "reward_std": 0.05102415755391121, "rewards/gemini_judge_reward_func/mean": 0.0848214253783226, "rewards/gemini_judge_reward_func/std": 0.19394874572753906, "rewards/semantic_correctness_reward_func/mean": 0.386641263961792, "rewards/semantic_correctness_reward_func/std": 0.18178777396678925, "rewards/xmlcount_reward_func/mean": 0.8401250839233398, "rewards/xmlcount_reward_func/std": 0.3684578537940979, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 158.55804443359375, "completions/mean_terminated_length": 150.76126098632812, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0215099654304127, "grad_norm": 0.024078436195850372, "kl": 0.007021784782409668, "learning_rate": 9.999972660400536e-06, "loss": -0.0319, "num_tokens": 22334655.0, "reward": 0.460908979177475, "reward_std": 0.0708736777305603, "rewards/gemini_judge_reward_func/mean": 0.1305803507566452, "rewards/gemini_judge_reward_func/std": 0.27298402786254883, "rewards/semantic_correctness_reward_func/mean": 0.43463388085365295, "rewards/semantic_correctness_reward_func/std": 0.2317088544368744, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 160.65179443359375, "completions/mean_terminated_length": 136.88990783691406, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.021851393453117665, "grad_norm": 0.024365782737731934, "kl": 0.0076389312744140625, "learning_rate": 9.999890641901124e-06, "loss": -0.0079, "num_tokens": 22692925.0, "reward": 0.4045267403125763, "reward_std": 0.06387098878622055, "rewards/gemini_judge_reward_func/mean": 0.1372767835855484, "rewards/gemini_judge_reward_func/std": 0.23524853587150574, "rewards/semantic_correctness_reward_func/mean": 0.4610799252986908, "rewards/semantic_correctness_reward_func/std": 0.19871395826339722, "rewards/xmlcount_reward_func/mean": 0.643500030040741, "rewards/xmlcount_reward_func/std": 0.48071083426475525, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 146.6741180419922, "completions/mean_terminated_length": 146.6741180419922, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.02219282147582263, "grad_norm": 0.023962823674082756, "kl": 0.006612420082092285, "learning_rate": 9.999753945398704e-06, "loss": -0.0137, "num_tokens": 23054064.0, "reward": 0.4475335478782654, "reward_std": 0.06912019103765488, "rewards/gemini_judge_reward_func/mean": 0.1428571492433548, "rewards/gemini_judge_reward_func/std": 0.2683078646659851, "rewards/semantic_correctness_reward_func/mean": 0.414703369140625, "rewards/semantic_correctness_reward_func/std": 0.20449979603290558, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 160.45982360839844, "completions/mean_terminated_length": 144.7590789794922, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.02253424949852759, "grad_norm": 0.02477749064564705, "kl": 0.011017203330993652, "learning_rate": 9.99956257238817e-06, "loss": -0.0039, "num_tokens": 23412271.0, "reward": 0.4194851815700531, "reward_std": 0.06413974612951279, "rewards/gemini_judge_reward_func/mean": 0.1305803507566452, "rewards/gemini_judge_reward_func/std": 0.24138078093528748, "rewards/semantic_correctness_reward_func/mean": 0.44201499223709106, "rewards/semantic_correctness_reward_func/std": 0.2022552639245987, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 152.97769165039062, "completions/mean_terminated_length": 152.97769165039062, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.022875677521232554, "grad_norm": 0.02420092560350895, "kl": 0.004943966865539551, "learning_rate": 9.999316524962347e-06, "loss": 0.0069, "num_tokens": 23778402.0, "reward": 0.45530790090560913, "reward_std": 0.06338375061750412, "rewards/gemini_judge_reward_func/mean": 0.1216517835855484, "rewards/gemini_judge_reward_func/std": 0.21953363716602325, "rewards/semantic_correctness_reward_func/mean": 0.43342334032058716, "rewards/semantic_correctness_reward_func/std": 0.19556362926959991, "rewards/xmlcount_reward_func/mean": 0.799906313419342, "rewards/xmlcount_reward_func/std": 0.40196701884269714, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 160.02679443359375, "completions/mean_terminated_length": 148.29864501953125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.02321710554393752, "grad_norm": 0.024799056351184845, "kl": 0.012816905975341797, "learning_rate": 9.999015805811965e-06, "loss": -0.0062, "num_tokens": 24150064.0, "reward": 0.4068303406238556, "reward_std": 0.0636182427406311, "rewards/gemini_judge_reward_func/mean": 0.1116071417927742, "rewards/gemini_judge_reward_func/std": 0.20322315394878387, "rewards/semantic_correctness_reward_func/mean": 0.4032764434814453, "rewards/semantic_correctness_reward_func/std": 0.2195780724287033, "rewards/xmlcount_reward_func/mean": 0.7038304209709167, "rewards/xmlcount_reward_func/std": 0.4546702206134796, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 181.34376525878906, "completions/mean_terminated_length": 150.13426208496094, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.023558533566642482, "grad_norm": 0.02307036519050598, "kl": 0.005774140357971191, "learning_rate": 9.998660418225645e-06, "loss": -0.0188, "num_tokens": 24533433.0, "reward": 0.3904527425765991, "reward_std": 0.06628313660621643, "rewards/gemini_judge_reward_func/mean": 0.0870535746216774, "rewards/gemini_judge_reward_func/std": 0.1809597611427307, "rewards/semantic_correctness_reward_func/mean": 0.39731696248054504, "rewards/semantic_correctness_reward_func/std": 0.19073733687400818, "rewards/xmlcount_reward_func/mean": 0.6904196739196777, "rewards/xmlcount_reward_func/std": 0.4628920555114746, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 159.8928680419922, "completions/mean_terminated_length": 140.1643829345703, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.023899961589347447, "grad_norm": 0.025376953184604645, "kl": 0.00702059268951416, "learning_rate": 9.998250366089848e-06, "loss": -0.0119, "num_tokens": 24890453.0, "reward": 0.3956013023853302, "reward_std": 0.062416452914476395, "rewards/gemini_judge_reward_func/mean": 0.1049107164144516, "rewards/gemini_judge_reward_func/std": 0.18533207476139069, "rewards/semantic_correctness_reward_func/mean": 0.4300599694252014, "rewards/semantic_correctness_reward_func/std": 0.21468216180801392, "rewards/xmlcount_reward_func/mean": 0.6690624952316284, "rewards/xmlcount_reward_func/std": 0.5187152624130249, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 157.625, "completions/mean_terminated_length": 149.81982421875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.02424138961205241, "grad_norm": 0.025495875626802444, "kl": 0.005542397499084473, "learning_rate": 9.997785653888835e-06, "loss": 0.0107, "num_tokens": 25250817.0, "reward": 0.3918258845806122, "reward_std": 0.04926810413599014, "rewards/gemini_judge_reward_func/mean": 0.0591517873108387, "rewards/gemini_judge_reward_func/std": 0.16611815989017487, "rewards/semantic_correctness_reward_func/mean": 0.3750758469104767, "rewards/semantic_correctness_reward_func/std": 0.16816116869449615, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 161.25894165039062, "completions/mean_terminated_length": 149.54751586914062, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.02458281763475737, "grad_norm": 0.02477003075182438, "kl": 0.008527755737304688, "learning_rate": 9.99726628670463e-06, "loss": -0.0166, "num_tokens": 25625067.0, "reward": 0.4139401614665985, "reward_std": 0.07099711894989014, "rewards/gemini_judge_reward_func/mean": 0.1417410671710968, "rewards/gemini_judge_reward_func/std": 0.29332807660102844, "rewards/semantic_correctness_reward_func/mean": 0.4232363700866699, "rewards/semantic_correctness_reward_func/std": 0.21512280404567719, "rewards/xmlcount_reward_func/mean": 0.6814910769462585, "rewards/xmlcount_reward_func/std": 0.4640570878982544, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 155.7857208251953, "completions/mean_terminated_length": 151.8923797607422, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.024924245657462336, "grad_norm": 0.025401996448636055, "kl": 0.0072678327560424805, "learning_rate": 9.996692270216946e-06, "loss": 0.0065, "num_tokens": 25947051.0, "reward": 0.4428517818450928, "reward_std": 0.07458332180976868, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.23313553631305695, "rewards/semantic_correctness_reward_func/mean": 0.43593719601631165, "rewards/semantic_correctness_reward_func/std": 0.20209956169128418, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 160.8303680419922, "completions/mean_terminated_length": 149.11312866210938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.0252656736801673, "grad_norm": 0.023690922185778618, "kl": 0.008687734603881836, "learning_rate": 9.996063610703138e-06, "loss": -0.0026, "num_tokens": 26289245.0, "reward": 0.4326671361923218, "reward_std": 0.07397673279047012, "rewards/gemini_judge_reward_func/mean": 0.1484375, "rewards/gemini_judge_reward_func/std": 0.2781420946121216, "rewards/semantic_correctness_reward_func/mean": 0.4355142414569855, "rewards/semantic_correctness_reward_func/std": 0.20559628307819366, "rewards/xmlcount_reward_func/mean": 0.7154732942581177, "rewards/xmlcount_reward_func/std": 0.5058081746101379, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 165.9553680419922, "completions/mean_terminated_length": 142.33944702148438, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.025607101702872264, "grad_norm": 0.02803461067378521, "kl": 0.01035165786743164, "learning_rate": 9.995380315038119e-06, "loss": 0.0036, "num_tokens": 26674083.0, "reward": 0.3838358223438263, "reward_std": 0.06592860817909241, "rewards/gemini_judge_reward_func/mean": 0.1372767835855484, "rewards/gemini_judge_reward_func/std": 0.25469791889190674, "rewards/semantic_correctness_reward_func/mean": 0.4291253685951233, "rewards/semantic_correctness_reward_func/std": 0.21939414739608765, "rewards/xmlcount_reward_func/mean": 0.6077500581741333, "rewards/xmlcount_reward_func/std": 0.48996880650520325, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 158.9866180419922, "completions/mean_terminated_length": 158.9866180419922, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.025948529725577228, "grad_norm": 0.025290068238973618, "kl": 0.006925344467163086, "learning_rate": 9.994642390694308e-06, "loss": -0.0185, "num_tokens": 27000188.0, "reward": 0.44614139199256897, "reward_std": 0.043179940432310104, "rewards/gemini_judge_reward_func/mean": 0.0770089253783226, "rewards/gemini_judge_reward_func/std": 0.19360975921154022, "rewards/semantic_correctness_reward_func/mean": 0.3964388370513916, "rewards/semantic_correctness_reward_func/std": 0.1987782120704651, "rewards/xmlcount_reward_func/mean": 0.8401250839233398, "rewards/xmlcount_reward_func/std": 0.3684578835964203, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 159.27232360839844, "completions/mean_terminated_length": 151.4819793701172, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.02628995774828219, "grad_norm": 0.023038053885102272, "kl": 0.009779095649719238, "learning_rate": 9.993849845741525e-06, "loss": -0.0165, "num_tokens": 27375557.0, "reward": 0.40907853841781616, "reward_std": 0.07521604001522064, "rewards/gemini_judge_reward_func/mean": 0.1339285671710968, "rewards/gemini_judge_reward_func/std": 0.2708328068256378, "rewards/semantic_correctness_reward_func/mean": 0.419696182012558, "rewards/semantic_correctness_reward_func/std": 0.20244161784648895, "rewards/xmlcount_reward_func/mean": 0.6789196729660034, "rewards/xmlcount_reward_func/std": 0.4654209613800049, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 167.2366180419922, "completions/mean_terminated_length": 155.60633850097656, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.026631385770987153, "grad_norm": 0.023966865614056587, "kl": 0.009274482727050781, "learning_rate": 9.993002688846913e-06, "loss": 0.0102, "num_tokens": 27739970.0, "reward": 0.4366755783557892, "reward_std": 0.07375102490186691, "rewards/gemini_judge_reward_func/mean": 0.1439732164144516, "rewards/gemini_judge_reward_func/std": 0.2324018031358719, "rewards/semantic_correctness_reward_func/mean": 0.4509579837322235, "rewards/semantic_correctness_reward_func/std": 0.18391193449497223, "rewards/xmlcount_reward_func/mean": 0.7222366333007812, "rewards/xmlcount_reward_func/std": 0.4450782239437103, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0535714285714286, "completions/max_length": 1024.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 188.74107360839844, "completions/mean_terminated_length": 141.46226501464844, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.026972813793692117, "grad_norm": 0.024487853050231934, "kl": 0.01374959945678711, "learning_rate": 9.992100929274848e-06, "loss": -0.0392, "num_tokens": 28137276.0, "reward": 0.35368138551712036, "reward_std": 0.06350585073232651, "rewards/gemini_judge_reward_func/mean": 0.1238839253783226, "rewards/gemini_judge_reward_func/std": 0.23437467217445374, "rewards/semantic_correctness_reward_func/mean": 0.4414603114128113, "rewards/semantic_correctness_reward_func/std": 0.19774943590164185, "rewards/xmlcount_reward_func/mean": 0.5395892858505249, "rewards/xmlcount_reward_func/std": 0.4981267750263214, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 164.45089721679688, "completions/mean_terminated_length": 140.7935791015625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.02731424181639708, "grad_norm": 0.02403208427131176, "kl": 0.011779546737670898, "learning_rate": 9.991144576886824e-06, "loss": -0.0213, "num_tokens": 28517273.0, "reward": 0.3964942395687103, "reward_std": 0.058009881526231766, "rewards/gemini_judge_reward_func/mean": 0.1361607164144516, "rewards/gemini_judge_reward_func/std": 0.24118632078170776, "rewards/semantic_correctness_reward_func/mean": 0.4231494963169098, "rewards/semantic_correctness_reward_func/std": 0.2020299881696701, "rewards/xmlcount_reward_func/mean": 0.643500030040741, "rewards/xmlcount_reward_func/std": 0.48071083426475525, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 161.32144165039062, "completions/mean_terminated_length": 157.4529266357422, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.027655669839102046, "grad_norm": 0.022105740383267403, "kl": 0.009316205978393555, "learning_rate": 9.990133642141359e-06, "loss": 0.0047, "num_tokens": 28865717.0, "reward": 0.44927382469177246, "reward_std": 0.06996231526136398, "rewards/gemini_judge_reward_func/mean": 0.1294642835855484, "rewards/gemini_judge_reward_func/std": 0.2233111709356308, "rewards/semantic_correctness_reward_func/mean": 0.4501902759075165, "rewards/semantic_correctness_reward_func/std": 0.20007607340812683, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 164.7857208251953, "completions/mean_terminated_length": 149.16363525390625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.027997097861807006, "grad_norm": 0.02345276065170765, "kl": 0.012184381484985352, "learning_rate": 9.989068136093873e-06, "loss": -0.0162, "num_tokens": 29228517.0, "reward": 0.44200220704078674, "reward_std": 0.08123025298118591, "rewards/gemini_judge_reward_func/mean": 0.1529017835855484, "rewards/gemini_judge_reward_func/std": 0.2866664230823517, "rewards/semantic_correctness_reward_func/mean": 0.4497699439525604, "rewards/semantic_correctness_reward_func/std": 0.21584469079971313, "rewards/xmlcount_reward_func/mean": 0.727218747138977, "rewards/xmlcount_reward_func/std": 0.4462124705314636, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 150.43304443359375, "completions/mean_terminated_length": 150.43304443359375, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.02833852588451197, "grad_norm": 0.022828485816717148, "kl": 0.011949777603149414, "learning_rate": 9.987948070396572e-06, "loss": -0.0194, "num_tokens": 29601014.0, "reward": 0.42945489287376404, "reward_std": 0.06218741089105606, "rewards/gemini_judge_reward_func/mean": 0.140625, "rewards/gemini_judge_reward_func/std": 0.24440200626850128, "rewards/semantic_correctness_reward_func/mean": 0.4360244572162628, "rewards/semantic_correctness_reward_func/std": 0.2285270392894745, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 168.99107360839844, "completions/mean_terminated_length": 149.47030639648438, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.028679953907216935, "grad_norm": 0.025373326614499092, "kl": 0.01007533073425293, "learning_rate": 9.986773457298311e-06, "loss": -0.0258, "num_tokens": 29964904.0, "reward": 0.4488268494606018, "reward_std": 0.07115625590085983, "rewards/gemini_judge_reward_func/mean": 0.1071428582072258, "rewards/gemini_judge_reward_func/std": 0.2136441320180893, "rewards/semantic_correctness_reward_func/mean": 0.4210982918739319, "rewards/semantic_correctness_reward_func/std": 0.20950660109519958, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 155.44644165039062, "completions/mean_terminated_length": 147.6216278076172, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.0290213819299219, "grad_norm": 0.024450762197375298, "kl": 0.011575698852539062, "learning_rate": 9.985544309644474e-06, "loss": -0.0256, "num_tokens": 30329712.0, "reward": 0.4297153353691101, "reward_std": 0.06669414043426514, "rewards/gemini_judge_reward_func/mean": 0.1194196417927742, "rewards/gemini_judge_reward_func/std": 0.20902319252490997, "rewards/semantic_correctness_reward_func/mean": 0.4439873695373535, "rewards/semantic_correctness_reward_func/std": 0.18770352005958557, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 166.62054443359375, "completions/mean_terminated_length": 151.0318145751953, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.029362809952626863, "grad_norm": 0.02488037198781967, "kl": 0.009441137313842773, "learning_rate": 9.984260640876821e-06, "loss": -0.0263, "num_tokens": 30661951.0, "reward": 0.44441041350364685, "reward_std": 0.05848705768585205, "rewards/gemini_judge_reward_func/mean": 0.109375, "rewards/gemini_judge_reward_func/std": 0.23385359346866608, "rewards/semantic_correctness_reward_func/mean": 0.3945517838001251, "rewards/semantic_correctness_reward_func/std": 0.2169126272201538, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 169.80357360839844, "completions/mean_terminated_length": 146.2935791015625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.029704237975331824, "grad_norm": 0.023754147812724113, "kl": 0.011398077011108398, "learning_rate": 9.98292246503335e-06, "loss": -0.0201, "num_tokens": 31028499.0, "reward": 0.39785102009773254, "reward_std": 0.05590759217739105, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.2527772784233093, "rewards/semantic_correctness_reward_func/mean": 0.44779059290885925, "rewards/semantic_correctness_reward_func/std": 0.1991998553276062, "rewards/xmlcount_reward_func/mean": 0.643500030040741, "rewards/xmlcount_reward_func/std": 0.48071083426475525, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 156.375, "completions/mean_terminated_length": 148.55856323242188, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.030045665998036788, "grad_norm": 0.023754147812724113, "kl": 0.009763479232788086, "learning_rate": 9.98292246503335e-06, "loss": 0.0098, "num_tokens": 31364211.0, "reward": 0.4608972370624542, "reward_std": 0.0726761519908905, "rewards/gemini_judge_reward_func/mean": 0.1428571492433548, "rewards/gemini_judge_reward_func/std": 0.23607943952083588, "rewards/semantic_correctness_reward_func/mean": 0.48152169585227966, "rewards/semantic_correctness_reward_func/std": 0.17897123098373413, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 166.60269165039062, "completions/mean_terminated_length": 147.0273895263672, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.030387094020741752, "grad_norm": 0.02381015755236149, "kl": 0.014133691787719727, "learning_rate": 9.981529796748135e-06, "loss": 0.0053, "num_tokens": 31723014.0, "reward": 0.4156407117843628, "reward_std": 0.06060226634144783, "rewards/gemini_judge_reward_func/mean": 0.1082589253783226, "rewards/gemini_judge_reward_func/std": 0.23136581480503082, "rewards/semantic_correctness_reward_func/mean": 0.43168550729751587, "rewards/semantic_correctness_reward_func/std": 0.21562719345092773, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 162.58482360839844, "completions/mean_terminated_length": 154.82432556152344, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.030728522043446716, "grad_norm": 0.02302616834640503, "kl": 0.008588314056396484, "learning_rate": 9.980082651251175e-06, "loss": -0.0421, "num_tokens": 32084121.0, "reward": 0.43415945768356323, "reward_std": 0.07369165122509003, "rewards/gemini_judge_reward_func/mean": 0.1071428582072258, "rewards/gemini_judge_reward_func/std": 0.22389310598373413, "rewards/semantic_correctness_reward_func/mean": 0.45501139760017395, "rewards/semantic_correctness_reward_func/std": 0.21243339776992798, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 166.32144165039062, "completions/mean_terminated_length": 146.73971557617188, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.03106995006615168, "grad_norm": 0.02333798073232174, "kl": 0.014536380767822266, "learning_rate": 9.97858104436822e-06, "loss": -0.0453, "num_tokens": 32455833.0, "reward": 0.38213050365448, "reward_std": 0.05550408363342285, "rewards/gemini_judge_reward_func/mean": 0.0915178582072258, "rewards/gemini_judge_reward_func/std": 0.21174997091293335, "rewards/semantic_correctness_reward_func/mean": 0.40486663579940796, "rewards/semantic_correctness_reward_func/std": 0.2065107375383377, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 160.05804443359375, "completions/mean_terminated_length": 140.3333282470703, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.03141137808885664, "grad_norm": 0.023909490555524826, "kl": 0.010914802551269531, "learning_rate": 9.977024992520604e-06, "loss": -0.0204, "num_tokens": 32794082.0, "reward": 0.4354327321052551, "reward_std": 0.06427149474620819, "rewards/gemini_judge_reward_func/mean": 0.1350446492433548, "rewards/gemini_judge_reward_func/std": 0.23890070617198944, "rewards/semantic_correctness_reward_func/mean": 0.4413240849971771, "rewards/semantic_correctness_reward_func/std": 0.20441272854804993, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 164.6875, "completions/mean_terminated_length": 149.06362915039062, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.03175280611156161, "grad_norm": 0.023738721385598183, "kl": 0.0113372802734375, "learning_rate": 9.975414512725058e-06, "loss": -0.027, "num_tokens": 33150324.0, "reward": 0.41209447383880615, "reward_std": 0.08080209791660309, "rewards/gemini_judge_reward_func/mean": 0.1383928507566452, "rewards/gemini_judge_reward_func/std": 0.23637604713439941, "rewards/semantic_correctness_reward_func/mean": 0.4341597557067871, "rewards/semantic_correctness_reward_func/std": 0.2263120412826538, "rewards/xmlcount_reward_func/mean": 0.6747633814811707, "rewards/xmlcount_reward_func/std": 0.4702146351337433, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 170.7857208251953, "completions/mean_terminated_length": 151.30592346191406, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.03209423413426657, "grad_norm": 0.024146920070052147, "kl": 0.011864662170410156, "learning_rate": 9.973749622593534e-06, "loss": 0.0014, "num_tokens": 33538188.0, "reward": 0.411119282245636, "reward_std": 0.057556502521038055, "rewards/gemini_judge_reward_func/mean": 0.1171875, "rewards/gemini_judge_reward_func/std": 0.22571587562561035, "rewards/semantic_correctness_reward_func/mean": 0.4269711673259735, "rewards/semantic_correctness_reward_func/std": 0.21064431965351105, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 160.40179443359375, "completions/mean_terminated_length": 156.52915954589844, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.03243566215697153, "grad_norm": 0.024719731882214546, "kl": 0.010405540466308594, "learning_rate": 9.972030340333e-06, "loss": -0.0024, "num_tokens": 33882934.0, "reward": 0.44124558568000793, "reward_std": 0.07164882123470306, "rewards/gemini_judge_reward_func/mean": 0.1160714253783226, "rewards/gemini_judge_reward_func/std": 0.22814461588859558, "rewards/semantic_correctness_reward_func/mean": 0.4279598593711853, "rewards/semantic_correctness_reward_func/std": 0.19526031613349915, "rewards/xmlcount_reward_func/mean": 0.7730625867843628, "rewards/xmlcount_reward_func/std": 0.4180354177951813, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 156.02679443359375, "completions/mean_terminated_length": 144.24435424804688, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0327770901796765, "grad_norm": 0.02348247356712818, "kl": 0.009704351425170898, "learning_rate": 9.970256684745258e-06, "loss": -0.0143, "num_tokens": 34226852.0, "reward": 0.43676093220710754, "reward_std": 0.06913831830024719, "rewards/gemini_judge_reward_func/mean": 0.1350446492433548, "rewards/gemini_judge_reward_func/std": 0.2558937668800354, "rewards/semantic_correctness_reward_func/mean": 0.46584004163742065, "rewards/semantic_correctness_reward_func/std": 0.2180854231119156, "rewards/xmlcount_reward_func/mean": 0.7239375114440918, "rewards/xmlcount_reward_func/std": 0.4488600790500641, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 185.54019165039062, "completions/mean_terminated_length": 158.4930877685547, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.03311851820238146, "grad_norm": 0.02116106078028679, "kl": 0.008137702941894531, "learning_rate": 9.968428675226714e-06, "loss": 0.0093, "num_tokens": 34568397.0, "reward": 0.4413779377937317, "reward_std": 0.05757666751742363, "rewards/gemini_judge_reward_func/mean": 0.1138392835855484, "rewards/gemini_judge_reward_func/std": 0.22927159070968628, "rewards/semantic_correctness_reward_func/mean": 0.42639848589897156, "rewards/semantic_correctness_reward_func/std": 0.20541919767856598, "rewards/xmlcount_reward_func/mean": 0.7764062881469727, "rewards/xmlcount_reward_func/std": 0.4174662232398987, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 153.9732208251953, "completions/mean_terminated_length": 146.1351318359375, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.033459946225086426, "grad_norm": 0.02458954229950905, "kl": 0.012489795684814453, "learning_rate": 9.966546331768192e-06, "loss": -0.029, "num_tokens": 34916131.0, "reward": 0.4533292353153229, "reward_std": 0.05946576967835426, "rewards/gemini_judge_reward_func/mean": 0.1540178507566452, "rewards/gemini_judge_reward_func/std": 0.2796248495578766, "rewards/semantic_correctness_reward_func/mean": 0.45711034536361694, "rewards/semantic_correctness_reward_func/std": 0.23785069584846497, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 152.41519165039062, "completions/mean_terminated_length": 144.5630645751953, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.03380137424779139, "grad_norm": 0.02564327046275139, "kl": 0.01104736328125, "learning_rate": 9.964609674954696e-06, "loss": -0.0008, "num_tokens": 35263604.0, "reward": 0.45329129695892334, "reward_std": 0.08842761814594269, "rewards/gemini_judge_reward_func/mean": 0.1540178507566452, "rewards/gemini_judge_reward_func/std": 0.26096048951148987, "rewards/semantic_correctness_reward_func/mean": 0.45692071318626404, "rewards/semantic_correctness_reward_func/std": 0.2106001377105713, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 161.0803680419922, "completions/mean_terminated_length": 137.3302764892578, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.034142802270496354, "grad_norm": 0.02536987140774727, "kl": 0.011751174926757812, "learning_rate": 9.962618725965196e-06, "loss": -0.0198, "num_tokens": 35630690.0, "reward": 0.4004952311515808, "reward_std": 0.06513310968875885, "rewards/gemini_judge_reward_func/mean": 0.1339285671710968, "rewards/gemini_judge_reward_func/std": 0.26027876138687134, "rewards/semantic_correctness_reward_func/mean": 0.44761887192726135, "rewards/semantic_correctness_reward_func/std": 0.19793058931827545, "rewards/xmlcount_reward_func/mean": 0.643500030040741, "rewards/xmlcount_reward_func/std": 0.48071083426475525, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 160.3928680419922, "completions/mean_terminated_length": 144.69090270996094, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.034484230293201315, "grad_norm": 0.024176469072699547, "kl": 0.01227426528930664, "learning_rate": 9.960573506572391e-06, "loss": -0.0191, "num_tokens": 36018366.0, "reward": 0.3939872682094574, "reward_std": 0.0706588476896286, "rewards/gemini_judge_reward_func/mean": 0.1227678582072258, "rewards/gemini_judge_reward_func/std": 0.22830908000469208, "rewards/semantic_correctness_reward_func/mean": 0.4105878472328186, "rewards/semantic_correctness_reward_func/std": 0.19481845200061798, "rewards/xmlcount_reward_func/mean": 0.656906247138977, "rewards/xmlcount_reward_func/std": 0.47649866342544556, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 153.43304443359375, "completions/mean_terminated_length": 153.43304443359375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.034825658315906276, "grad_norm": 0.025619763880968094, "kl": 0.00950765609741211, "learning_rate": 9.95847403914247e-06, "loss": -0.0065, "num_tokens": 36359395.0, "reward": 0.44875597953796387, "reward_std": 0.05499029532074928, "rewards/gemini_judge_reward_func/mean": 0.0770089253783226, "rewards/gemini_judge_reward_func/std": 0.16035409271717072, "rewards/semantic_correctness_reward_func/mean": 0.3737618029117584, "rewards/semantic_correctness_reward_func/std": 0.17840272188186646, "rewards/xmlcount_reward_func/mean": 0.8580000996589661, "rewards/xmlcount_reward_func/std": 0.35106155276298523, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 162.66519165039062, "completions/mean_terminated_length": 150.9728546142578, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.03516708633861124, "grad_norm": 0.022210268303751945, "kl": 0.01260066032409668, "learning_rate": 9.956320346634877e-06, "loss": -0.0227, "num_tokens": 36727936.0, "reward": 0.38284486532211304, "reward_std": 0.05238700285553932, "rewards/gemini_judge_reward_func/mean": 0.0993303582072258, "rewards/gemini_judge_reward_func/std": 0.2006424516439438, "rewards/semantic_correctness_reward_func/mean": 0.3570633828639984, "rewards/semantic_correctness_reward_func/std": 0.17993000149726868, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 148.9866180419922, "completions/mean_terminated_length": 145.0627899169922, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.035508514361316204, "grad_norm": 0.024317806586623192, "kl": 0.011048316955566406, "learning_rate": 9.954112452602045e-06, "loss": 0.004, "num_tokens": 37079877.0, "reward": 0.43635034561157227, "reward_std": 0.061006706207990646, "rewards/gemini_judge_reward_func/mean": 0.1227678582072258, "rewards/gemini_judge_reward_func/std": 0.22334477305412292, "rewards/semantic_correctness_reward_func/mean": 0.4347156882286072, "rewards/semantic_correctness_reward_func/std": 0.18455860018730164, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 146.26339721679688, "completions/mean_terminated_length": 138.35586547851562, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.03584994238402117, "grad_norm": 0.024815207347273827, "kl": 0.012839555740356445, "learning_rate": 9.951850381189152e-06, "loss": -0.0026, "num_tokens": 37415952.0, "reward": 0.44215184450149536, "reward_std": 0.072935089468956, "rewards/gemini_judge_reward_func/mean": 0.1361607164144516, "rewards/gemini_judge_reward_func/std": 0.22804586589336395, "rewards/semantic_correctness_reward_func/mean": 0.44141072034835815, "rewards/semantic_correctness_reward_func/std": 0.20808282494544983, "rewards/xmlcount_reward_func/mean": 0.748513400554657, "rewards/xmlcount_reward_func/std": 0.43441200256347656, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 146.9866180419922, "completions/mean_terminated_length": 146.9866180419922, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.03619137040672613, "grad_norm": 0.024026190862059593, "kl": 0.011193037033081055, "learning_rate": 9.949534157133844e-06, "loss": -0.031, "num_tokens": 37739521.0, "reward": 0.4355818033218384, "reward_std": 0.06153957173228264, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.225807324051857, "rewards/semantic_correctness_reward_func/mean": 0.4353373646736145, "rewards/semantic_correctness_reward_func/std": 0.20193377137184143, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 155.53125, "completions/mean_terminated_length": 155.53125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.03653279842943109, "grad_norm": 0.024166366085410118, "kl": 0.014576911926269531, "learning_rate": 9.94716380576598e-06, "loss": -0.0241, "num_tokens": 38103460.0, "reward": 0.40730592608451843, "reward_std": 0.05648142471909523, "rewards/gemini_judge_reward_func/mean": 0.0982142835855484, "rewards/gemini_judge_reward_func/std": 0.21662190556526184, "rewards/semantic_correctness_reward_func/mean": 0.4013420045375824, "rewards/semantic_correctness_reward_func/std": 0.19534234702587128, "rewards/xmlcount_reward_func/mean": 0.7193795442581177, "rewards/xmlcount_reward_func/std": 0.45101282000541687, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 153.33929443359375, "completions/mean_terminated_length": 145.49549865722656, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.03687422645213606, "grad_norm": 0.023334842175245285, "kl": 0.010807275772094727, "learning_rate": 9.944739353007344e-06, "loss": -0.0011, "num_tokens": 38444604.0, "reward": 0.44895491003990173, "reward_std": 0.0724000632762909, "rewards/gemini_judge_reward_func/mean": 0.1450892835855484, "rewards/gemini_judge_reward_func/std": 0.2784051299095154, "rewards/semantic_correctness_reward_func/mean": 0.45309582352638245, "rewards/semantic_correctness_reward_func/std": 0.2039255052804947, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 154.91519165039062, "completions/mean_terminated_length": 147.08558654785156, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.03721565447484102, "grad_norm": 0.023959942162036896, "kl": 0.013292789459228516, "learning_rate": 9.942260825371359e-06, "loss": -0.0149, "num_tokens": 38779941.0, "reward": 0.44470787048339844, "reward_std": 0.057739898562431335, "rewards/gemini_judge_reward_func/mean": 0.1049107164144516, "rewards/gemini_judge_reward_func/std": 0.21080242097377777, "rewards/semantic_correctness_reward_func/mean": 0.44071775674819946, "rewards/semantic_correctness_reward_func/std": 0.1816713809967041, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 152.68304443359375, "completions/mean_terminated_length": 144.83334350585938, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.03755708249754599, "grad_norm": 0.024471363052725792, "kl": 0.014130592346191406, "learning_rate": 9.939728249962808e-06, "loss": -0.0143, "num_tokens": 39145830.0, "reward": 0.38968026638031006, "reward_std": 0.059608135372400284, "rewards/gemini_judge_reward_func/mean": 0.1015625, "rewards/gemini_judge_reward_func/std": 0.21571606397628784, "rewards/semantic_correctness_reward_func/mean": 0.3867761790752411, "rewards/semantic_correctness_reward_func/std": 0.2074183076620102, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 151.26339721679688, "completions/mean_terminated_length": 147.3497772216797, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.03789851052025095, "grad_norm": 0.02474793791770935, "kl": 0.012769222259521484, "learning_rate": 9.937141654477529e-06, "loss": -0.0224, "num_tokens": 39505753.0, "reward": 0.40522071719169617, "reward_std": 0.06155985966324806, "rewards/gemini_judge_reward_func/mean": 0.1060267835855484, "rewards/gemini_judge_reward_func/std": 0.22875528037548065, "rewards/semantic_correctness_reward_func/mean": 0.4198000431060791, "rewards/semantic_correctness_reward_func/std": 0.1814102977514267, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 151.0803680419922, "completions/mean_terminated_length": 139.23077392578125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.03823993854295591, "grad_norm": 0.02324873022735119, "kl": 0.01199030876159668, "learning_rate": 9.934501067202117e-06, "loss": -0.0208, "num_tokens": 39904871.0, "reward": 0.411319375038147, "reward_std": 0.0837107002735138, "rewards/gemini_judge_reward_func/mean": 0.1316964328289032, "rewards/gemini_judge_reward_func/std": 0.2739836871623993, "rewards/semantic_correctness_reward_func/mean": 0.45257896184921265, "rewards/semantic_correctness_reward_func/std": 0.22245247662067413, "rewards/xmlcount_reward_func/mean": 0.6703125238418579, "rewards/xmlcount_reward_func/std": 0.4718664884567261, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 156.05804443359375, "completions/mean_terminated_length": 144.27603149414062, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.03858136656566088, "grad_norm": 0.025134863331913948, "kl": 0.011367321014404297, "learning_rate": 9.931806517013612e-06, "loss": 0.0098, "num_tokens": 40254420.0, "reward": 0.43234628438949585, "reward_std": 0.08657613396644592, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.24603478610515594, "rewards/semantic_correctness_reward_func/mean": 0.4244635999202728, "rewards/semantic_correctness_reward_func/std": 0.20944832265377045, "rewards/xmlcount_reward_func/mean": 0.7414017915725708, "rewards/xmlcount_reward_func/std": 0.4350353181362152, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 158.1607208251953, "completions/mean_terminated_length": 150.36036682128906, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.03892279458836584, "grad_norm": 0.023998796939849854, "kl": 0.015682697296142578, "learning_rate": 9.929058033379181e-06, "loss": -0.0196, "num_tokens": 40596484.0, "reward": 0.415109246969223, "reward_std": 0.05301095172762871, "rewards/gemini_judge_reward_func/mean": 0.0703125, "rewards/gemini_judge_reward_func/std": 0.16341476142406464, "rewards/semantic_correctness_reward_func/mean": 0.39767107367515564, "rewards/semantic_correctness_reward_func/std": 0.1908901482820511, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 152.52232360839844, "completions/mean_terminated_length": 140.6923065185547, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.039264222611070806, "grad_norm": 0.02244039997458458, "kl": 0.013239383697509766, "learning_rate": 9.926255646355804e-06, "loss": 0.0074, "num_tokens": 40948209.0, "reward": 0.43548721075057983, "reward_std": 0.06909479200839996, "rewards/gemini_judge_reward_func/mean": 0.171875, "rewards/gemini_judge_reward_func/std": 0.2720773220062256, "rewards/semantic_correctness_reward_func/mean": 0.4166412055492401, "rewards/semantic_correctness_reward_func/std": 0.2098981738090515, "rewards/xmlcount_reward_func/mean": 0.7085223197937012, "rewards/xmlcount_reward_func/std": 0.4550124406814575, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 161.6294708251953, "completions/mean_terminated_length": 153.86036682128906, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.03960565063377577, "grad_norm": 0.022398851811885834, "kl": 0.010973930358886719, "learning_rate": 9.923399386589933e-06, "loss": -0.0155, "num_tokens": 41311642.0, "reward": 0.4142858684062958, "reward_std": 0.05273973196744919, "rewards/gemini_judge_reward_func/mean": 0.1361607164144516, "rewards/gemini_judge_reward_func/std": 0.2514272928237915, "rewards/semantic_correctness_reward_func/mean": 0.4406077563762665, "rewards/semantic_correctness_reward_func/std": 0.2008506804704666, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 150.70982360839844, "completions/mean_terminated_length": 146.79373168945312, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.03994707865648073, "grad_norm": 0.023609992116689682, "kl": 0.015387296676635742, "learning_rate": 9.920489285317169e-06, "loss": 0.0, "num_tokens": 41676577.0, "reward": 0.4282933175563812, "reward_std": 0.06423311680555344, "rewards/gemini_judge_reward_func/mean": 0.1049107164144516, "rewards/gemini_judge_reward_func/std": 0.19847622513771057, "rewards/semantic_correctness_reward_func/mean": 0.4301450550556183, "rewards/semantic_correctness_reward_func/std": 0.18929553031921387, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 165.3303680419922, "completions/mean_terminated_length": 149.71817016601562, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.040288506679185696, "grad_norm": 0.024569395929574966, "kl": 0.012789726257324219, "learning_rate": 9.917525374361913e-06, "loss": 0.0035, "num_tokens": 42018127.0, "reward": 0.4590550661087036, "reward_std": 0.06488455832004547, "rewards/gemini_judge_reward_func/mean": 0.1484375, "rewards/gemini_judge_reward_func/std": 0.24146370589733124, "rewards/semantic_correctness_reward_func/mean": 0.47008776664733887, "rewards/semantic_correctness_reward_func/std": 0.2246067225933075, "rewards/xmlcount_reward_func/mean": 0.7641563415527344, "rewards/xmlcount_reward_func/std": 0.4263768792152405, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 143.3928680419922, "completions/mean_terminated_length": 139.44395446777344, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.040629934701890656, "grad_norm": 0.025357214733958244, "kl": 0.016598224639892578, "learning_rate": 9.91450768613702e-06, "loss": -0.0098, "num_tokens": 42389311.0, "reward": 0.39798423647880554, "reward_std": 0.06020635738968849, "rewards/gemini_judge_reward_func/mean": 0.0970982164144516, "rewards/gemini_judge_reward_func/std": 0.22159849107265472, "rewards/semantic_correctness_reward_func/mean": 0.4014746844768524, "rewards/semantic_correctness_reward_func/std": 0.19921056926250458, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 163.8616180419922, "completions/mean_terminated_length": 152.18553161621094, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.040971362724595624, "grad_norm": 0.02301410585641861, "kl": 0.012985706329345703, "learning_rate": 9.911436253643445e-06, "loss": -0.0195, "num_tokens": 42740136.0, "reward": 0.4232383370399475, "reward_std": 0.0674186572432518, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.2437458485364914, "rewards/semantic_correctness_reward_func/mean": 0.43172726035118103, "rewards/semantic_correctness_reward_func/std": 0.19425256550312042, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 155.0, "completions/mean_terminated_length": 143.20362854003906, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.041312790747300585, "grad_norm": 0.023077256977558136, "kl": 0.015984773635864258, "learning_rate": 9.908311110469881e-06, "loss": 0.0032, "num_tokens": 43095476.0, "reward": 0.4018684923648834, "reward_std": 0.06371606886386871, "rewards/gemini_judge_reward_func/mean": 0.1116071417927742, "rewards/gemini_judge_reward_func/std": 0.23037132620811462, "rewards/semantic_correctness_reward_func/mean": 0.4276280999183655, "rewards/semantic_correctness_reward_func/std": 0.2191450297832489, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853896975517273, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 156.51339721679688, "completions/mean_terminated_length": 148.6981964111328, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.041654218770005545, "grad_norm": 0.02264971099793911, "kl": 0.01440286636352539, "learning_rate": 9.905132290792395e-06, "loss": -0.0204, "num_tokens": 43427135.0, "reward": 0.4580499231815338, "reward_std": 0.06534235179424286, "rewards/gemini_judge_reward_func/mean": 0.1383928507566452, "rewards/gemini_judge_reward_func/std": 0.2535383701324463, "rewards/semantic_correctness_reward_func/mean": 0.44046369194984436, "rewards/semantic_correctness_reward_func/std": 0.21319182217121124, "rewards/xmlcount_reward_func/mean": 0.786500096321106, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 175.90179443359375, "completions/mean_terminated_length": 152.55963134765625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.04199564679271051, "grad_norm": 0.0236386526376009, "kl": 0.013439178466796875, "learning_rate": 9.901899829374048e-06, "loss": -0.0155, "num_tokens": 43793809.0, "reward": 0.41775208711624146, "reward_std": 0.0679284930229187, "rewards/gemini_judge_reward_func/mean": 0.1049107164144516, "rewards/gemini_judge_reward_func/std": 0.24407413601875305, "rewards/semantic_correctness_reward_func/mean": 0.41318878531455994, "rewards/semantic_correctness_reward_func/std": 0.20918205380439758, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 159.7991180419922, "completions/mean_terminated_length": 140.0684814453125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.042337074815415474, "grad_norm": 0.02372078038752079, "kl": 0.014812469482421875, "learning_rate": 9.89861376156452e-06, "loss": -0.0073, "num_tokens": 44142160.0, "reward": 0.4198228418827057, "reward_std": 0.06786998361349106, "rewards/gemini_judge_reward_func/mean": 0.1551339328289032, "rewards/gemini_judge_reward_func/std": 0.28249844908714294, "rewards/semantic_correctness_reward_func/mean": 0.4303463101387024, "rewards/semantic_correctness_reward_func/std": 0.21610724925994873, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 156.77679443359375, "completions/mean_terminated_length": 148.96397399902344, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.04267850283812044, "grad_norm": 0.024051638320088387, "kl": 0.014676809310913086, "learning_rate": 9.895274123299724e-06, "loss": -0.0017, "num_tokens": 44485938.0, "reward": 0.42380136251449585, "reward_std": 0.060133740305900574, "rewards/gemini_judge_reward_func/mean": 0.1116071417927742, "rewards/gemini_judge_reward_func/std": 0.2178659439086914, "rewards/semantic_correctness_reward_func/mean": 0.3942924439907074, "rewards/semantic_correctness_reward_func/std": 0.22006738185882568, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 168.0625, "completions/mean_terminated_length": 156.44345092773438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.0430199308608254, "grad_norm": 0.023526515811681747, "kl": 0.012475728988647461, "learning_rate": 9.891880951101407e-06, "loss": 0.0064, "num_tokens": 44856932.0, "reward": 0.40895187854766846, "reward_std": 0.0454825833439827, "rewards/gemini_judge_reward_func/mean": 0.0870535746216774, "rewards/gemini_judge_reward_func/std": 0.19867786765098572, "rewards/semantic_correctness_reward_func/mean": 0.4026699662208557, "rewards/semantic_correctness_reward_func/std": 0.20043903589248657, "rewards/xmlcount_reward_func/mean": 0.7339910864830017, "rewards/xmlcount_reward_func/std": 0.44274044036865234, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 162.9375, "completions/mean_terminated_length": 147.2818145751953, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.04336135888353036, "grad_norm": 0.02356693334877491, "kl": 0.01137542724609375, "learning_rate": 9.888434282076759e-06, "loss": -0.0299, "num_tokens": 45207042.0, "reward": 0.42898255586624146, "reward_std": 0.05791693180799484, "rewards/gemini_judge_reward_func/mean": 0.0948660746216774, "rewards/gemini_judge_reward_func/std": 0.20417828857898712, "rewards/semantic_correctness_reward_func/mean": 0.38218027353286743, "rewards/semantic_correctness_reward_func/std": 0.1872968077659607, "rewards/xmlcount_reward_func/mean": 0.786500096321106, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 157.73214721679688, "completions/mean_terminated_length": 145.9728546142578, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.04370278690623533, "grad_norm": 0.023837530985474586, "kl": 0.013489961624145508, "learning_rate": 9.884934153917998e-06, "loss": -0.0031, "num_tokens": 45548426.0, "reward": 0.4394071698188782, "reward_std": 0.08121853321790695, "rewards/gemini_judge_reward_func/mean": 0.1116071417927742, "rewards/gemini_judge_reward_func/std": 0.23990675806999207, "rewards/semantic_correctness_reward_func/mean": 0.4495357573032379, "rewards/semantic_correctness_reward_func/std": 0.20210714638233185, "rewards/xmlcount_reward_func/mean": 0.7621428370475769, "rewards/xmlcount_reward_func/std": 0.483738511800766, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 151.5803680419922, "completions/mean_terminated_length": 147.6681671142578, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.04404421492894029, "grad_norm": 0.024463778361678123, "kl": 0.014742374420166016, "learning_rate": 9.881380604901964e-06, "loss": -0.0051, "num_tokens": 45899380.0, "reward": 0.468868613243103, "reward_std": 0.06673333793878555, "rewards/gemini_judge_reward_func/mean": 0.1372767835855484, "rewards/gemini_judge_reward_func/std": 0.22798827290534973, "rewards/semantic_correctness_reward_func/mean": 0.46103930473327637, "rewards/semantic_correctness_reward_func/std": 0.2001686841249466, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 169.2991180419922, "completions/mean_terminated_length": 141.7281036376953, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.04438564295164526, "grad_norm": 0.02389535680413246, "kl": 0.013713598251342773, "learning_rate": 9.877773673889702e-06, "loss": 0.0058, "num_tokens": 46278347.0, "reward": 0.4204433560371399, "reward_std": 0.0702306404709816, "rewards/gemini_judge_reward_func/mean": 0.1439732164144516, "rewards/gemini_judge_reward_func/std": 0.235991969704628, "rewards/semantic_correctness_reward_func/mean": 0.44755586981773376, "rewards/semantic_correctness_reward_func/std": 0.216790109872818, "rewards/xmlcount_reward_func/mean": 0.6833571791648865, "rewards/xmlcount_reward_func/std": 0.46658626198768616, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 155.3169708251953, "completions/mean_terminated_length": 151.42153930664062, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.04472707097435022, "grad_norm": 0.023298032581806183, "kl": 0.011845588684082031, "learning_rate": 9.874113400326031e-06, "loss": -0.0069, "num_tokens": 46584138.0, "reward": 0.5024296641349792, "reward_std": 0.07264265418052673, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.2560819387435913, "rewards/semantic_correctness_reward_func/mean": 0.4527016580104828, "rewards/semantic_correctness_reward_func/std": 0.20331206917762756, "rewards/xmlcount_reward_func/mean": 0.9024911522865295, "rewards/xmlcount_reward_func/std": 0.29851844906806946, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 183.74554443359375, "completions/mean_terminated_length": 152.625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.04506849899705518, "grad_norm": 0.022067122161388397, "kl": 0.019008159637451172, "learning_rate": 9.870399824239116e-06, "loss": -0.0057, "num_tokens": 46960697.0, "reward": 0.3944554924964905, "reward_std": 0.060083404183387756, "rewards/gemini_judge_reward_func/mean": 0.0948660746216774, "rewards/gemini_judge_reward_func/std": 0.2082555890083313, "rewards/semantic_correctness_reward_func/mean": 0.4240451157093048, "rewards/semantic_correctness_reward_func/std": 0.2055032104253769, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 161.64732360839844, "completions/mean_terminated_length": 149.94117736816406, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.04540992701976015, "grad_norm": 0.023691849783062935, "kl": 0.011993408203125, "learning_rate": 9.86663298624003e-06, "loss": -0.0322, "num_tokens": 47315422.0, "reward": 0.48760873079299927, "reward_std": 0.07489056885242462, "rewards/gemini_judge_reward_func/mean": 0.1428571492433548, "rewards/gemini_judge_reward_func/std": 0.25981688499450684, "rewards/semantic_correctness_reward_func/mean": 0.43856117129325867, "rewards/semantic_correctness_reward_func/std": 0.21570587158203125, "rewards/xmlcount_reward_func/mean": 0.8568840026855469, "rewards/xmlcount_reward_func/std": 0.3510022759437561, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 167.0357208251953, "completions/mean_terminated_length": 155.40272521972656, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.04575135504246511, "grad_norm": 0.021651828661561012, "kl": 0.0135650634765625, "learning_rate": 9.86281292752231e-06, "loss": -0.0115, "num_tokens": 47674206.0, "reward": 0.4171735942363739, "reward_std": 0.0660734549164772, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.28012555837631226, "rewards/semantic_correctness_reward_func/mean": 0.4371536672115326, "rewards/semantic_correctness_reward_func/std": 0.21627959609031677, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 149.97769165039062, "completions/mean_terminated_length": 149.97769165039062, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.046092783065170076, "grad_norm": 0.02360036037862301, "kl": 0.01648855209350586, "learning_rate": 9.858939689861506e-06, "loss": -0.0124, "num_tokens": 48012965.0, "reward": 0.44242221117019653, "reward_std": 0.07747457921504974, "rewards/gemini_judge_reward_func/mean": 0.1573660671710968, "rewards/gemini_judge_reward_func/std": 0.25402894616127014, "rewards/semantic_correctness_reward_func/mean": 0.44056612253189087, "rewards/semantic_correctness_reward_func/std": 0.23137415945529938, "rewards/xmlcount_reward_func/mean": 0.7284063100814819, "rewards/xmlcount_reward_func/std": 0.4465975761413574, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 155.32589721679688, "completions/mean_terminated_length": 147.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.04643421108787504, "grad_norm": 0.022543828934431076, "kl": 0.0161590576171875, "learning_rate": 9.855013315614725e-06, "loss": -0.0333, "num_tokens": 48372202.0, "reward": 0.4096542000770569, "reward_std": 0.06795641779899597, "rewards/gemini_judge_reward_func/mean": 0.1104910746216774, "rewards/gemini_judge_reward_func/std": 0.23392580449581146, "rewards/semantic_correctness_reward_func/mean": 0.40622615814208984, "rewards/semantic_correctness_reward_func/std": 0.22462278604507446, "rewards/xmlcount_reward_func/mean": 0.7105312943458557, "rewards/xmlcount_reward_func/std": 0.4553159773349762, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 157.4375, "completions/mean_terminated_length": 153.55157470703125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.046775639110580004, "grad_norm": 0.021333523094654083, "kl": 0.012816905975341797, "learning_rate": 9.851033847720167e-06, "loss": -0.0072, "num_tokens": 48754868.0, "reward": 0.42013928294181824, "reward_std": 0.0754006877541542, "rewards/gemini_judge_reward_func/mean": 0.1305803507566452, "rewards/gemini_judge_reward_func/std": 0.27092286944389343, "rewards/semantic_correctness_reward_func/mean": 0.4452856183052063, "rewards/semantic_correctness_reward_func/std": 0.2089463472366333, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 159.84375, "completions/mean_terminated_length": 148.11312866210938, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.047117067133284965, "grad_norm": 0.02194381318986416, "kl": 0.011771917343139648, "learning_rate": 9.847001329696653e-06, "loss": 0.0087, "num_tokens": 49104497.0, "reward": 0.4554777145385742, "reward_std": 0.058295100927352905, "rewards/gemini_judge_reward_func/mean": 0.1305803507566452, "rewards/gemini_judge_reward_func/std": 0.22453762590885162, "rewards/semantic_correctness_reward_func/mean": 0.44322773814201355, "rewards/semantic_correctness_reward_func/std": 0.18111221492290497, "rewards/xmlcount_reward_func/mean": 0.786500096321106, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 150.88839721679688, "completions/mean_terminated_length": 146.9730987548828, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.047458495155989926, "grad_norm": 0.021744275465607643, "kl": 0.014683246612548828, "learning_rate": 9.842915805643156e-06, "loss": -0.0157, "num_tokens": 49473884.0, "reward": 0.4144614338874817, "reward_std": 0.06900496780872345, "rewards/gemini_judge_reward_func/mean": 0.1808035671710968, "rewards/gemini_judge_reward_func/std": 0.2961066961288452, "rewards/semantic_correctness_reward_func/mean": 0.447878360748291, "rewards/semantic_correctness_reward_func/std": 0.19721916317939758, "rewards/xmlcount_reward_func/mean": 0.6314107179641724, "rewards/xmlcount_reward_func/std": 0.48076197504997253, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 168.02679443359375, "completions/mean_terminated_length": 144.46788024902344, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.04779992317869489, "grad_norm": 0.021885110065340996, "kl": 0.015885353088378906, "learning_rate": 9.838777320238312e-06, "loss": -0.0297, "num_tokens": 49867834.0, "reward": 0.39027801156044006, "reward_std": 0.06917236000299454, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.24603478610515594, "rewards/semantic_correctness_reward_func/mean": 0.43455058336257935, "rewards/semantic_correctness_reward_func/std": 0.20599418878555298, "rewards/xmlcount_reward_func/mean": 0.6311875581741333, "rewards/xmlcount_reward_func/std": 0.48217126727104187, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 164.79464721679688, "completions/mean_terminated_length": 149.1727294921875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.048141351201399854, "grad_norm": 0.021948551759123802, "kl": 0.014840126037597656, "learning_rate": 9.834585918739936e-06, "loss": 0.0182, "num_tokens": 50228388.0, "reward": 0.4530205726623535, "reward_std": 0.08578796684741974, "rewards/gemini_judge_reward_func/mean": 0.1796875, "rewards/gemini_judge_reward_func/std": 0.30927425622940063, "rewards/semantic_correctness_reward_func/mean": 0.4399777054786682, "rewards/semantic_correctness_reward_func/std": 0.2128545343875885, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 152.30357360839844, "completions/mean_terminated_length": 144.45045471191406, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.04848277922410482, "grad_norm": 0.02323133312165737, "kl": 0.015910625457763672, "learning_rate": 9.830341646984521e-06, "loss": 0.0111, "num_tokens": 50582596.0, "reward": 0.44822028279304504, "reward_std": 0.06913460791110992, "rewards/gemini_judge_reward_func/mean": 0.1607142835855484, "rewards/gemini_judge_reward_func/std": 0.27068495750427246, "rewards/semantic_correctness_reward_func/mean": 0.45392245054244995, "rewards/semantic_correctness_reward_func/std": 0.20474979281425476, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 167.79019165039062, "completions/mean_terminated_length": 148.24200439453125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.04882420724680978, "grad_norm": 0.022231949493288994, "kl": 0.013165473937988281, "learning_rate": 9.826044551386743e-06, "loss": -0.0225, "num_tokens": 50930225.0, "reward": 0.44765713810920715, "reward_std": 0.06493684649467468, "rewards/gemini_judge_reward_func/mean": 0.1328125, "rewards/gemini_judge_reward_func/std": 0.26138925552368164, "rewards/semantic_correctness_reward_func/mean": 0.4354102909564972, "rewards/semantic_correctness_reward_func/std": 0.2049492746591568, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 168.24107360839844, "completions/mean_terminated_length": 148.70318603515625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.04916563526951474, "grad_norm": 0.02268117107450962, "kl": 0.01633310317993164, "learning_rate": 9.821694678938954e-06, "loss": -0.0143, "num_tokens": 51295311.0, "reward": 0.37321770191192627, "reward_std": 0.06530405580997467, "rewards/gemini_judge_reward_func/mean": 0.125, "rewards/gemini_judge_reward_func/std": 0.22083210945129395, "rewards/semantic_correctness_reward_func/mean": 0.4452759623527527, "rewards/semantic_correctness_reward_func/std": 0.19345305860042572, "rewards/xmlcount_reward_func/mean": 0.5854063034057617, "rewards/xmlcount_reward_func/std": 0.49435025453567505, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 162.8616180419922, "completions/mean_terminated_length": 151.1719512939453, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.04950706329221971, "grad_norm": 0.021489452570676804, "kl": 0.01521444320678711, "learning_rate": 9.817292077210658e-06, "loss": -0.012, "num_tokens": 51677272.0, "reward": 0.4057961404323578, "reward_std": 0.06886345148086548, "rewards/gemini_judge_reward_func/mean": 0.125, "rewards/gemini_judge_reward_func/std": 0.2516759932041168, "rewards/semantic_correctness_reward_func/mean": 0.4517931044101715, "rewards/semantic_correctness_reward_func/std": 0.20376348495483398, "rewards/xmlcount_reward_func/mean": 0.6635938286781311, "rewards/xmlcount_reward_func/std": 0.4730554521083832, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 168.55804443359375, "completions/mean_terminated_length": 153.00454711914062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.04984849131492467, "grad_norm": 0.023336565122008324, "kl": 0.012263774871826172, "learning_rate": 9.812836794348005e-06, "loss": -0.0348, "num_tokens": 52045293.0, "reward": 0.4521057605743408, "reward_std": 0.07398687303066254, "rewards/gemini_judge_reward_func/mean": 0.1462053507566452, "rewards/gemini_judge_reward_func/std": 0.25739532709121704, "rewards/semantic_correctness_reward_func/mean": 0.47555527091026306, "rewards/semantic_correctness_reward_func/std": 0.2144700139760971, "rewards/xmlcount_reward_func/mean": 0.7462812662124634, "rewards/xmlcount_reward_func/std": 0.4369716942310333, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 158.34375, "completions/mean_terminated_length": 154.46189880371094, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.05018991933762964, "grad_norm": 0.022060193121433258, "kl": 0.014934062957763672, "learning_rate": 9.808328879073251e-06, "loss": -0.0244, "num_tokens": 52393254.0, "reward": 0.42671334743499756, "reward_std": 0.06478478014469147, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.25933241844177246, "rewards/semantic_correctness_reward_func/mean": 0.41112020611763, "rewards/semantic_correctness_reward_func/std": 0.1945081204175949, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 164.73214721679688, "completions/mean_terminated_length": 141.0825653076172, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.0505313473603346, "grad_norm": 0.02345438115298748, "kl": 0.016699790954589844, "learning_rate": 9.803768380684242e-06, "loss": -0.0423, "num_tokens": 52763070.0, "reward": 0.39287370443344116, "reward_std": 0.0494573637843132, "rewards/gemini_judge_reward_func/mean": 0.125, "rewards/gemini_judge_reward_func/std": 0.26043254137039185, "rewards/semantic_correctness_reward_func/mean": 0.3916184902191162, "rewards/semantic_correctness_reward_func/std": 0.2100548893213272, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 154.75, "completions/mean_terminated_length": 146.91893005371094, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.05087277538303956, "grad_norm": 0.02226773090660572, "kl": 0.015322446823120117, "learning_rate": 9.79915534905385e-06, "loss": -0.0125, "num_tokens": 53114594.0, "reward": 0.45522111654281616, "reward_std": 0.07392917573451996, "rewards/gemini_judge_reward_func/mean": 0.1852678507566452, "rewards/gemini_judge_reward_func/std": 0.28459224104881287, "rewards/semantic_correctness_reward_func/mean": 0.4398198127746582, "rewards/semantic_correctness_reward_func/std": 0.2161870151758194, "rewards/xmlcount_reward_func/mean": 0.7328750491142273, "rewards/xmlcount_reward_func/std": 0.44427841901779175, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 166.0803680419922, "completions/mean_terminated_length": 150.4818115234375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.05121420340574453, "grad_norm": 0.02213941141963005, "kl": 0.014892578125, "learning_rate": 9.794489834629457e-06, "loss": 0.0032, "num_tokens": 53466324.0, "reward": 0.4585185945034027, "reward_std": 0.08465974032878876, "rewards/gemini_judge_reward_func/mean": 0.140625, "rewards/gemini_judge_reward_func/std": 0.27259188890457153, "rewards/semantic_correctness_reward_func/mean": 0.4383428394794464, "rewards/semantic_correctness_reward_func/std": 0.21441145241260529, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 146.16964721679688, "completions/mean_terminated_length": 138.26126098632812, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.05155563142844949, "grad_norm": 0.02260763570666313, "kl": 0.014892101287841797, "learning_rate": 9.789771888432375e-06, "loss": 0.0059, "num_tokens": 53833862.0, "reward": 0.38839536905288696, "reward_std": 0.06746162474155426, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.24830262362957, "rewards/semantic_correctness_reward_func/mean": 0.4273248314857483, "rewards/semantic_correctness_reward_func/std": 0.20607294142246246, "rewards/xmlcount_reward_func/mean": 0.6300938725471497, "rewards/xmlcount_reward_func/std": 0.48451390862464905, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 168.9732208251953, "completions/mean_terminated_length": 149.45204162597656, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.051897059451154456, "grad_norm": 0.022246094420552254, "kl": 0.013828754425048828, "learning_rate": 9.785001562057311e-06, "loss": -0.0048, "num_tokens": 54173988.0, "reward": 0.424908310174942, "reward_std": 0.06946201622486115, "rewards/gemini_judge_reward_func/mean": 0.1350446492433548, "rewards/gemini_judge_reward_func/std": 0.2602379024028778, "rewards/semantic_correctness_reward_func/mean": 0.4132825434207916, "rewards/semantic_correctness_reward_func/std": 0.20428363978862762, "rewards/xmlcount_reward_func/mean": 0.7205848693847656, "rewards/xmlcount_reward_func/std": 0.44707995653152466, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 160.70982360839844, "completions/mean_terminated_length": 141.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.05223848747385942, "grad_norm": 0.022493815049529076, "kl": 0.015873432159423828, "learning_rate": 9.780178907671788e-06, "loss": -0.018, "num_tokens": 54559591.0, "reward": 0.36641088128089905, "reward_std": 0.05886705592274666, "rewards/gemini_judge_reward_func/mean": 0.0725446417927742, "rewards/gemini_judge_reward_func/std": 0.17885597050189972, "rewards/semantic_correctness_reward_func/mean": 0.400027334690094, "rewards/semantic_correctness_reward_func/std": 0.185530886054039, "rewards/xmlcount_reward_func/mean": 0.6434687376022339, "rewards/xmlcount_reward_func/std": 0.4783778190612793, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 163.7857208251953, "completions/mean_terminated_length": 152.1085968017578, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.05257991549656438, "grad_norm": 0.02088047005236149, "kl": 0.013623714447021484, "learning_rate": 9.775303978015585e-06, "loss": 0.003, "num_tokens": 54911635.0, "reward": 0.47129741311073303, "reward_std": 0.0827810987830162, "rewards/gemini_judge_reward_func/mean": 0.1551339328289032, "rewards/gemini_judge_reward_func/std": 0.26509660482406616, "rewards/semantic_correctness_reward_func/mean": 0.46478161215782166, "rewards/semantic_correctness_reward_func/std": 0.22101202607154846, "rewards/xmlcount_reward_func/mean": 0.7907187342643738, "rewards/xmlcount_reward_func/std": 0.40539026260375977, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 170.61607360839844, "completions/mean_terminated_length": 147.12843322753906, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.052921343519269345, "grad_norm": 0.021672114729881287, "kl": 0.016643762588500977, "learning_rate": 9.77037682640015e-06, "loss": -0.0197, "num_tokens": 55285241.0, "reward": 0.38316836953163147, "reward_std": 0.05640144646167755, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.2169451117515564, "rewards/semantic_correctness_reward_func/mean": 0.42352014780044556, "rewards/semantic_correctness_reward_func/std": 0.21736378967761993, "rewards/xmlcount_reward_func/mean": 0.6256250143051147, "rewards/xmlcount_reward_func/std": 0.48569241166114807, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 158.07144165039062, "completions/mean_terminated_length": 154.1883544921875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.053262771541974306, "grad_norm": 0.021647725254297256, "kl": 0.011748313903808594, "learning_rate": 9.765397506708023e-06, "loss": 0.0024, "num_tokens": 55603469.0, "reward": 0.4995449483394623, "reward_std": 0.06327866017818451, "rewards/gemini_judge_reward_func/mean": 0.1227678582072258, "rewards/gemini_judge_reward_func/std": 0.22953340411186218, "rewards/semantic_correctness_reward_func/mean": 0.4289388656616211, "rewards/semantic_correctness_reward_func/std": 0.19935138523578644, "rewards/xmlcount_reward_func/mean": 0.9116250276565552, "rewards/xmlcount_reward_func/std": 0.28608015179634094, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 167.4732208251953, "completions/mean_terminated_length": 155.84616088867188, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.053604199564679274, "grad_norm": 0.0213526152074337, "kl": 0.01268625259399414, "learning_rate": 9.760366073392246e-06, "loss": -0.0418, "num_tokens": 55956707.0, "reward": 0.44403091073036194, "reward_std": 0.07015395164489746, "rewards/gemini_judge_reward_func/mean": 0.1473214328289032, "rewards/gemini_judge_reward_func/std": 0.24616695940494537, "rewards/semantic_correctness_reward_func/mean": 0.42401161789894104, "rewards/semantic_correctness_reward_func/std": 0.20338886976242065, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 153.09375, "completions/mean_terminated_length": 149.1883544921875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.053945627587384234, "grad_norm": 0.022866856306791306, "kl": 0.013154983520507812, "learning_rate": 9.755282581475769e-06, "loss": -0.0114, "num_tokens": 56308040.0, "reward": 0.4244222044944763, "reward_std": 0.06508694589138031, "rewards/gemini_judge_reward_func/mean": 0.1071428582072258, "rewards/gemini_judge_reward_func/std": 0.20562243461608887, "rewards/semantic_correctness_reward_func/mean": 0.44207531213760376, "rewards/semantic_correctness_reward_func/std": 0.21528586745262146, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 166.02232360839844, "completions/mean_terminated_length": 154.37557983398438, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.054287055610089195, "grad_norm": 0.020635830238461494, "kl": 0.014980077743530273, "learning_rate": 9.750147086550843e-06, "loss": -0.0214, "num_tokens": 56668957.0, "reward": 0.41711556911468506, "reward_std": 0.052652522921562195, "rewards/gemini_judge_reward_func/mean": 0.0970982164144516, "rewards/gemini_judge_reward_func/std": 0.21123822033405304, "rewards/semantic_correctness_reward_func/mean": 0.3898812234401703, "rewards/semantic_correctness_reward_func/std": 0.1925191879272461, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 169.67857360839844, "completions/mean_terminated_length": 154.14544677734375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.05462848363279416, "grad_norm": 0.02253010869026184, "kl": 0.014844894409179688, "learning_rate": 9.744959644778422e-06, "loss": -0.0133, "num_tokens": 57031605.0, "reward": 0.40882542729377747, "reward_std": 0.07083141058683395, "rewards/gemini_judge_reward_func/mean": 0.1305803507566452, "rewards/gemini_judge_reward_func/std": 0.2459813803434372, "rewards/semantic_correctness_reward_func/mean": 0.4065912067890167, "rewards/semantic_correctness_reward_func/std": 0.18288636207580566, "rewards/xmlcount_reward_func/mean": 0.6881875395774841, "rewards/xmlcount_reward_func/std": 0.46501508355140686, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 166.9553680419922, "completions/mean_terminated_length": 155.32127380371094, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.054969911655499124, "grad_norm": 0.021544380113482475, "kl": 0.013626575469970703, "learning_rate": 9.739720312887536e-06, "loss": -0.0282, "num_tokens": 57369867.0, "reward": 0.44754543900489807, "reward_std": 0.07036061584949493, "rewards/gemini_judge_reward_func/mean": 0.1194196417927742, "rewards/gemini_judge_reward_func/std": 0.24253910779953003, "rewards/semantic_correctness_reward_func/mean": 0.4258878231048584, "rewards/semantic_correctness_reward_func/std": 0.1886364221572876, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 164.01339721679688, "completions/mean_terminated_length": 144.3789825439453, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.05531133967820409, "grad_norm": 0.021095486357808113, "kl": 0.016514301300048828, "learning_rate": 9.734429148174676e-06, "loss": -0.0261, "num_tokens": 57756406.0, "reward": 0.35722407698631287, "reward_std": 0.07497614622116089, "rewards/gemini_judge_reward_func/mean": 0.1350446492433548, "rewards/gemini_judge_reward_func/std": 0.27490234375, "rewards/semantic_correctness_reward_func/mean": 0.40778082609176636, "rewards/semantic_correctness_reward_func/std": 0.1879914402961731, "rewards/xmlcount_reward_func/mean": 0.5541250109672546, "rewards/xmlcount_reward_func/std": 0.49873343110084534, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 173.91519165039062, "completions/mean_terminated_length": 162.37557983398438, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.05565276770090905, "grad_norm": 0.022490430623292923, "kl": 0.0135345458984375, "learning_rate": 9.729086208503174e-06, "loss": -0.0096, "num_tokens": 58128191.0, "reward": 0.4325447976589203, "reward_std": 0.05169191583991051, "rewards/gemini_judge_reward_func/mean": 0.0892857164144516, "rewards/gemini_judge_reward_func/std": 0.17189638316631317, "rewards/semantic_correctness_reward_func/mean": 0.446902334690094, "rewards/semantic_correctness_reward_func/std": 0.18784944713115692, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 164.34375, "completions/mean_terminated_length": 152.67420959472656, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.05599419572361401, "grad_norm": 0.0230224821716547, "kl": 0.013494491577148438, "learning_rate": 9.723691552302563e-06, "loss": -0.008, "num_tokens": 58458552.0, "reward": 0.484989196062088, "reward_std": 0.07875213027000427, "rewards/gemini_judge_reward_func/mean": 0.1506696492433548, "rewards/gemini_judge_reward_func/std": 0.2547961473464966, "rewards/semantic_correctness_reward_func/mean": 0.44335657358169556, "rewards/semantic_correctness_reward_func/std": 0.19373531639575958, "rewards/xmlcount_reward_func/mean": 0.8401250839233398, "rewards/xmlcount_reward_func/std": 0.3684578537940979, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 150.0669708251953, "completions/mean_terminated_length": 138.20362854003906, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.05633562374631898, "grad_norm": 0.02238614670932293, "kl": 0.016913890838623047, "learning_rate": 9.718245238567939e-06, "loss": -0.0162, "num_tokens": 58824655.0, "reward": 0.38861343264579773, "reward_std": 0.06800764799118042, "rewards/gemini_judge_reward_func/mean": 0.1316964328289032, "rewards/gemini_judge_reward_func/std": 0.258184015750885, "rewards/semantic_correctness_reward_func/mean": 0.39789751172065735, "rewards/semantic_correctness_reward_func/std": 0.21606741845607758, "rewards/xmlcount_reward_func/mean": 0.6408883333206177, "rewards/xmlcount_reward_func/std": 0.480337917804718, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 157.02232360839844, "completions/mean_terminated_length": 149.2117156982422, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.05667705176902394, "grad_norm": 0.021430594846606255, "kl": 0.01669931411743164, "learning_rate": 9.712747326859316e-06, "loss": -0.0294, "num_tokens": 59189008.0, "reward": 0.37962663173675537, "reward_std": 0.059822093695402145, "rewards/gemini_judge_reward_func/mean": 0.1060267835855484, "rewards/gemini_judge_reward_func/std": 0.2311926931142807, "rewards/semantic_correctness_reward_func/mean": 0.3990795314311981, "rewards/semantic_correctness_reward_func/std": 0.21964147686958313, "rewards/xmlcount_reward_func/mean": 0.643500030040741, "rewards/xmlcount_reward_func/std": 0.48071083426475525, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 178.00894165039062, "completions/mean_terminated_length": 154.72476196289062, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.05701847979172891, "grad_norm": 0.021441694349050522, "kl": 0.013471364974975586, "learning_rate": 9.707197877300974e-06, "loss": 0.0005, "num_tokens": 59532526.0, "reward": 0.46267572045326233, "reward_std": 0.08093573898077011, "rewards/gemini_judge_reward_func/mean": 0.1383928507566452, "rewards/gemini_judge_reward_func/std": 0.26541972160339355, "rewards/semantic_correctness_reward_func/mean": 0.4269498884677887, "rewards/semantic_correctness_reward_func/std": 0.22219188511371613, "rewards/xmlcount_reward_func/mean": 0.8048214316368103, "rewards/xmlcount_reward_func/std": 0.39773449301719666, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 153.24107360839844, "completions/mean_terminated_length": 145.39639282226562, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.05735990781443387, "grad_norm": 0.022561749443411827, "kl": 0.01429438591003418, "learning_rate": 9.701596950580807e-06, "loss": 0.0165, "num_tokens": 59871796.0, "reward": 0.4066465198993683, "reward_std": 0.06362809985876083, "rewards/gemini_judge_reward_func/mean": 0.1194196417927742, "rewards/gemini_judge_reward_func/std": 0.20902319252490997, "rewards/semantic_correctness_reward_func/mean": 0.4001430869102478, "rewards/semantic_correctness_reward_func/std": 0.19026781618595123, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 159.53125, "completions/mean_terminated_length": 155.6547088623047, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.05770133583713883, "grad_norm": 0.022561749443411827, "kl": 0.013838768005371094, "learning_rate": 9.701596950580807e-06, "loss": -0.0069, "num_tokens": 60205287.0, "reward": 0.43320146203041077, "reward_std": 0.052795667201280594, "rewards/gemini_judge_reward_func/mean": 0.0859375, "rewards/gemini_judge_reward_func/std": 0.17600706219673157, "rewards/semantic_correctness_reward_func/mean": 0.38538211584091187, "rewards/semantic_correctness_reward_func/std": 0.2128061205148697, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 163.66964721679688, "completions/mean_terminated_length": 155.91893005371094, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.0580427638598438, "grad_norm": 0.022247465327382088, "kl": 0.011807918548583984, "learning_rate": 9.69594460794965e-06, "loss": -0.0284, "num_tokens": 60531329.0, "reward": 0.4758225083351135, "reward_std": 0.05295069143176079, "rewards/gemini_judge_reward_func/mean": 0.1383928507566452, "rewards/gemini_judge_reward_func/std": 0.23990674316883087, "rewards/semantic_correctness_reward_func/mean": 0.44310349225997925, "rewards/semantic_correctness_reward_func/std": 0.22307425737380981, "rewards/xmlcount_reward_func/mean": 0.8296116590499878, "rewards/xmlcount_reward_func/std": 0.3723537027835846, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 157.7053680419922, "completions/mean_terminated_length": 145.94570922851562, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.05838419188254876, "grad_norm": 0.02209446392953396, "kl": 0.01704883575439453, "learning_rate": 9.690240911220618e-06, "loss": -0.0096, "num_tokens": 60891823.0, "reward": 0.4018482267856598, "reward_std": 0.06590615957975388, "rewards/gemini_judge_reward_func/mean": 0.1160714253783226, "rewards/gemini_judge_reward_func/std": 0.23540008068084717, "rewards/semantic_correctness_reward_func/mean": 0.4185981750488281, "rewards/semantic_correctness_reward_func/std": 0.20078440010547638, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 162.10269165039062, "completions/mean_terminated_length": 142.42465209960938, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.058725619905253726, "grad_norm": 0.023919543251395226, "kl": 0.018400192260742188, "learning_rate": 9.684485922768422e-06, "loss": -0.0181, "num_tokens": 61264334.0, "reward": 0.40431568026542664, "reward_std": 0.055412210524082184, "rewards/gemini_judge_reward_func/mean": 0.1383928507566452, "rewards/gemini_judge_reward_func/std": 0.2716815173625946, "rewards/semantic_correctness_reward_func/mean": 0.4220426380634308, "rewards/semantic_correctness_reward_func/std": 0.22644221782684326, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 164.0625, "completions/mean_terminated_length": 156.31532287597656, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.05906704792795869, "grad_norm": 0.021184051409363747, "kl": 0.013810157775878906, "learning_rate": 9.678679705528699e-06, "loss": -0.0254, "num_tokens": 61618052.0, "reward": 0.43104228377342224, "reward_std": 0.06282518804073334, "rewards/gemini_judge_reward_func/mean": 0.1116071417927742, "rewards/gemini_judge_reward_func/std": 0.19904279708862305, "rewards/semantic_correctness_reward_func/mean": 0.44841665029525757, "rewards/semantic_correctness_reward_func/std": 0.19527798891067505, "rewards/xmlcount_reward_func/mean": 0.741790235042572, "rewards/xmlcount_reward_func/std": 0.4394533038139343, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 141.54464721679688, "completions/mean_terminated_length": 141.54464721679688, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.05940847595066365, "grad_norm": 0.023413272574543953, "kl": 0.014312744140625, "learning_rate": 9.672822322997305e-06, "loss": 0.0039, "num_tokens": 61970298.0, "reward": 0.4327978491783142, "reward_std": 0.06107241287827492, "rewards/gemini_judge_reward_func/mean": 0.1350446492433548, "rewards/gemini_judge_reward_func/std": 0.26451072096824646, "rewards/semantic_correctness_reward_func/mean": 0.4281497001647949, "rewards/semantic_correctness_reward_func/std": 0.20419283211231232, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 160.125, "completions/mean_terminated_length": 156.25112915039062, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.059749903973368615, "grad_norm": 0.021777305752038956, "kl": 0.014169454574584961, "learning_rate": 9.666913839229639e-06, "loss": 0.0003, "num_tokens": 62331346.0, "reward": 0.4691229462623596, "reward_std": 0.07057663053274155, "rewards/gemini_judge_reward_func/mean": 0.1238839253783226, "rewards/gemini_judge_reward_func/std": 0.23075933754444122, "rewards/semantic_correctness_reward_func/mean": 0.45334672927856445, "rewards/semantic_correctness_reward_func/std": 0.21422763168811798, "rewards/xmlcount_reward_func/mean": 0.8222500681877136, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 155.46429443359375, "completions/mean_terminated_length": 151.56951904296875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.060091331996073576, "grad_norm": 0.025058435276150703, "kl": 0.030515670776367188, "learning_rate": 9.660954318839934e-06, "loss": 0.0045, "num_tokens": 62678126.0, "reward": 0.46425560116767883, "reward_std": 0.057500917464494705, "rewards/gemini_judge_reward_func/mean": 0.1339285671710968, "rewards/gemini_judge_reward_func/std": 0.25483787059783936, "rewards/semantic_correctness_reward_func/mean": 0.44467073678970337, "rewards/semantic_correctness_reward_func/std": 0.22635166347026825, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 158.9419708251953, "completions/mean_terminated_length": 155.0627899169922, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.06043276001877854, "grad_norm": 0.02151457406580448, "kl": 0.014577388763427734, "learning_rate": 9.654943827000548e-06, "loss": -0.0264, "num_tokens": 63019985.0, "reward": 0.44926339387893677, "reward_std": 0.06779324263334274, "rewards/gemini_judge_reward_func/mean": 0.1127232164144516, "rewards/gemini_judge_reward_func/std": 0.22175651788711548, "rewards/semantic_correctness_reward_func/mean": 0.4478701949119568, "rewards/semantic_correctness_reward_func/std": 0.2044634222984314, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 164.93304443359375, "completions/mean_terminated_length": 157.19369506835938, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.060774188041483504, "grad_norm": 0.027191217988729477, "kl": 0.01347041130065918, "learning_rate": 9.648882429441258e-06, "loss": -0.0145, "num_tokens": 63344290.0, "reward": 0.46692782640457153, "reward_std": 0.06793376803398132, "rewards/gemini_judge_reward_func/mean": 0.1026785746216774, "rewards/gemini_judge_reward_func/std": 0.20789778232574463, "rewards/semantic_correctness_reward_func/mean": 0.4490319788455963, "rewards/semantic_correctness_reward_func/std": 0.21256029605865479, "rewards/xmlcount_reward_func/mean": 0.8401250243186951, "rewards/xmlcount_reward_func/std": 0.3684578537940979, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 148.0982208251953, "completions/mean_terminated_length": 148.0982208251953, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.06111561606418847, "grad_norm": 0.022637570276856422, "kl": 0.012826919555664062, "learning_rate": 9.642770192448537e-06, "loss": -0.0074, "num_tokens": 63716068.0, "reward": 0.45914244651794434, "reward_std": 0.0858091339468956, "rewards/gemini_judge_reward_func/mean": 0.1473214328289032, "rewards/gemini_judge_reward_func/std": 0.2583874762058258, "rewards/semantic_correctness_reward_func/mean": 0.4231850206851959, "rewards/semantic_correctness_reward_func/std": 0.21553608775138855, "rewards/xmlcount_reward_func/mean": 0.7889420390129089, "rewards/xmlcount_reward_func/std": 0.4050236642360687, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 153.35714721679688, "completions/mean_terminated_length": 153.35714721679688, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.06145704408689343, "grad_norm": 0.022738490253686905, "kl": 0.013818740844726562, "learning_rate": 9.636607182864828e-06, "loss": -0.0012, "num_tokens": 64067068.0, "reward": 0.47211530804634094, "reward_std": 0.05404976010322571, "rewards/gemini_judge_reward_func/mean": 0.1417410671710968, "rewards/gemini_judge_reward_func/std": 0.24774518609046936, "rewards/semantic_correctness_reward_func/mean": 0.45715656876564026, "rewards/semantic_correctness_reward_func/std": 0.2026260793209076, "rewards/xmlcount_reward_func/mean": 0.8099688291549683, "rewards/xmlcount_reward_func/std": 0.3902978301048279, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 150.27232360839844, "completions/mean_terminated_length": 146.35426330566406, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.06179847210959839, "grad_norm": 0.024008719250559807, "kl": 0.01700735092163086, "learning_rate": 9.630393468087818e-06, "loss": 0.0122, "num_tokens": 64438281.0, "reward": 0.4074629247188568, "reward_std": 0.053221508860588074, "rewards/gemini_judge_reward_func/mean": 0.0959821417927742, "rewards/gemini_judge_reward_func/std": 0.19159916043281555, "rewards/semantic_correctness_reward_func/mean": 0.4153500497341156, "rewards/semantic_correctness_reward_func/std": 0.1790420562028885, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 154.60714721679688, "completions/mean_terminated_length": 154.60714721679688, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.06213990013230336, "grad_norm": 0.022805478423833847, "kl": 0.012624263763427734, "learning_rate": 9.624129116069695e-06, "loss": -0.0175, "num_tokens": 64764149.0, "reward": 0.4789244532585144, "reward_std": 0.07191872596740723, "rewards/gemini_judge_reward_func/mean": 0.125, "rewards/gemini_judge_reward_func/std": 0.24718141555786133, "rewards/semantic_correctness_reward_func/mean": 0.4286222457885742, "rewards/semantic_correctness_reward_func/std": 0.22481898963451385, "rewards/xmlcount_reward_func/mean": 0.8580000996589661, "rewards/xmlcount_reward_func/std": 0.35106155276298523, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 146.19644165039062, "completions/mean_terminated_length": 146.19644165039062, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.06248132815500832, "grad_norm": 0.02222471684217453, "kl": 0.014984130859375, "learning_rate": 9.61781419531641e-06, "loss": -0.0111, "num_tokens": 65122957.0, "reward": 0.4180612862110138, "reward_std": 0.056327447295188904, "rewards/gemini_judge_reward_func/mean": 0.1049107164144516, "rewards/gemini_judge_reward_func/std": 0.20677535235881805, "rewards/semantic_correctness_reward_func/mean": 0.39460986852645874, "rewards/semantic_correctness_reward_func/std": 0.1847928911447525, "rewards/xmlcount_reward_func/mean": 0.7429375648498535, "rewards/xmlcount_reward_func/std": 0.43272262811660767, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 142.21875, "completions/mean_terminated_length": 142.21875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.06282275617771328, "grad_norm": 0.02309420332312584, "kl": 0.017095088958740234, "learning_rate": 9.611448774886925e-06, "loss": 0.0017, "num_tokens": 65481062.0, "reward": 0.40878623723983765, "reward_std": 0.05744129791855812, "rewards/gemini_judge_reward_func/mean": 0.1071428582072258, "rewards/gemini_judge_reward_func/std": 0.24194253981113434, "rewards/semantic_correctness_reward_func/mean": 0.39964547753334045, "rewards/semantic_correctness_reward_func/std": 0.1898690164089203, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 163.8482208251953, "completions/mean_terminated_length": 156.09910583496094, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.06316418420041825, "grad_norm": 0.02297145500779152, "kl": 0.012150287628173828, "learning_rate": 9.605032924392457e-06, "loss": -0.0288, "num_tokens": 65832704.0, "reward": 0.43201377987861633, "reward_std": 0.06907930970191956, "rewards/gemini_judge_reward_func/mean": 0.1138392835855484, "rewards/gemini_judge_reward_func/std": 0.2304907888174057, "rewards/semantic_correctness_reward_func/mean": 0.4308900535106659, "rewards/semantic_correctness_reward_func/std": 0.2030285894870758, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 150.8928680419922, "completions/mean_terminated_length": 150.8928680419922, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.06350561222312322, "grad_norm": 0.02199730835855007, "kl": 0.012223243713378906, "learning_rate": 9.598566713995718e-06, "loss": -0.006, "num_tokens": 66182992.0, "reward": 0.46267858147621155, "reward_std": 0.055922579020261765, "rewards/gemini_judge_reward_func/mean": 0.1395089328289032, "rewards/gemini_judge_reward_func/std": 0.23512086272239685, "rewards/semantic_correctness_reward_func/mean": 0.46137502789497375, "rewards/semantic_correctness_reward_func/std": 0.20330458879470825, "rewards/xmlcount_reward_func/mean": 0.786500096321106, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 161.57144165039062, "completions/mean_terminated_length": 149.8642578125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.06384704024582817, "grad_norm": 0.023103708401322365, "kl": 0.014789819717407227, "learning_rate": 9.592050214410152e-06, "loss": -0.0102, "num_tokens": 66515820.0, "reward": 0.43203839659690857, "reward_std": 0.06265180557966232, "rewards/gemini_judge_reward_func/mean": 0.1183035746216774, "rewards/gemini_judge_reward_func/std": 0.22575190663337708, "rewards/semantic_correctness_reward_func/mean": 0.4220845401287079, "rewards/semantic_correctness_reward_func/std": 0.2233872264623642, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 155.8303680419922, "completions/mean_terminated_length": 140.0454559326172, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.06418846826853314, "grad_norm": 0.02313879318535328, "kl": 0.014795541763305664, "learning_rate": 9.585483496899151e-06, "loss": -0.0222, "num_tokens": 66879166.0, "reward": 0.44597339630126953, "reward_std": 0.07401511073112488, "rewards/gemini_judge_reward_func/mean": 0.1640625, "rewards/gemini_judge_reward_func/std": 0.3006584942340851, "rewards/semantic_correctness_reward_func/mean": 0.42083999514579773, "rewards/semantic_correctness_reward_func/std": 0.2109062224626541, "rewards/xmlcount_reward_func/mean": 0.740450918674469, "rewards/xmlcount_reward_func/std": 0.439119815826416, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 146.3482208251953, "completions/mean_terminated_length": 146.3482208251953, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.0645298962912381, "grad_norm": 0.022025736048817635, "kl": 0.014230012893676758, "learning_rate": 9.578866633275289e-06, "loss": -0.0026, "num_tokens": 67217452.0, "reward": 0.4548703730106354, "reward_std": 0.05710577219724655, "rewards/gemini_judge_reward_func/mean": 0.1004464253783226, "rewards/gemini_judge_reward_func/std": 0.22450698912143707, "rewards/semantic_correctness_reward_func/mean": 0.39320898056030273, "rewards/semantic_correctness_reward_func/std": 0.22224441170692444, "rewards/xmlcount_reward_func/mean": 0.8401250243186951, "rewards/xmlcount_reward_func/std": 0.3684578835964203, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 159.1116180419922, "completions/mean_terminated_length": 147.3710479736328, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.06487132431394306, "grad_norm": 0.026655780151486397, "kl": 0.013226747512817383, "learning_rate": 9.572199695899522e-06, "loss": 0.0077, "num_tokens": 67551205.0, "reward": 0.41442981362342834, "reward_std": 0.05420489236712456, "rewards/gemini_judge_reward_func/mean": 0.0859375, "rewards/gemini_judge_reward_func/std": 0.19272884726524353, "rewards/semantic_correctness_reward_func/mean": 0.4345238506793976, "rewards/semantic_correctness_reward_func/std": 0.1913345605134964, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 156.5803680419922, "completions/mean_terminated_length": 144.80543518066406, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.06521275233664803, "grad_norm": 0.022954002022743225, "kl": 0.013891220092773438, "learning_rate": 9.565482757680415e-06, "loss": -0.0215, "num_tokens": 67917511.0, "reward": 0.4133635461330414, "reward_std": 0.07708757370710373, "rewards/gemini_judge_reward_func/mean": 0.1160714253783226, "rewards/gemini_judge_reward_func/std": 0.2281446009874344, "rewards/semantic_correctness_reward_func/mean": 0.44113001227378845, "rewards/semantic_correctness_reward_func/std": 0.18605509400367737, "rewards/xmlcount_reward_func/mean": 0.6967723965644836, "rewards/xmlcount_reward_func/std": 0.5375342965126038, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 152.76339721679688, "completions/mean_terminated_length": 140.93666076660156, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.065554180359353, "grad_norm": 0.02197893150150776, "kl": 0.013241052627563477, "learning_rate": 9.558715892073324e-06, "loss": -0.0152, "num_tokens": 68291746.0, "reward": 0.41031619906425476, "reward_std": 0.0750560611486435, "rewards/gemini_judge_reward_func/mean": 0.1361607164144516, "rewards/gemini_judge_reward_func/std": 0.26340386271476746, "rewards/semantic_correctness_reward_func/mean": 0.43535763025283813, "rewards/semantic_correctness_reward_func/std": 0.2265578806400299, "rewards/xmlcount_reward_func/mean": 0.6719509363174438, "rewards/xmlcount_reward_func/std": 0.4701566696166992, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 155.22769165039062, "completions/mean_terminated_length": 143.4344024658203, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.06589560838205796, "grad_norm": 0.02204836718738079, "kl": 0.015254497528076172, "learning_rate": 9.551899173079607e-06, "loss": 0.0193, "num_tokens": 68675557.0, "reward": 0.4088119864463806, "reward_std": 0.06842758506536484, "rewards/gemini_judge_reward_func/mean": 0.1462053507566452, "rewards/gemini_judge_reward_func/std": 0.28724196553230286, "rewards/semantic_correctness_reward_func/mean": 0.46464914083480835, "rewards/semantic_correctness_reward_func/std": 0.21852509677410126, "rewards/xmlcount_reward_func/mean": 0.643500030040741, "rewards/xmlcount_reward_func/std": 0.48071083426475525, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 164.80804443359375, "completions/mean_terminated_length": 153.14480590820312, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.06623703640476292, "grad_norm": 0.021376753225922585, "kl": 0.011332511901855469, "learning_rate": 9.545032675245814e-06, "loss": -0.0086, "num_tokens": 69038770.0, "reward": 0.47529637813568115, "reward_std": 0.0824359580874443, "rewards/gemini_judge_reward_func/mean": 0.1506696492433548, "rewards/gemini_judge_reward_func/std": 0.26871559023857117, "rewards/semantic_correctness_reward_func/mean": 0.4306424558162689, "rewards/semantic_correctness_reward_func/std": 0.21526159346103668, "rewards/xmlcount_reward_func/mean": 0.8222500681877136, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 169.97769165039062, "completions/mean_terminated_length": 154.4499969482422, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.06657846442746788, "grad_norm": 0.021403852850198746, "kl": 0.012527227401733398, "learning_rate": 9.538116473662862e-06, "loss": -0.0267, "num_tokens": 69391789.0, "reward": 0.3966446816921234, "reward_std": 0.046989116817712784, "rewards/gemini_judge_reward_func/mean": 0.0970982164144516, "rewards/gemini_judge_reward_func/std": 0.21387535333633423, "rewards/semantic_correctness_reward_func/mean": 0.3947768807411194, "rewards/semantic_correctness_reward_func/std": 0.1843794733285904, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 151.91519165039062, "completions/mean_terminated_length": 148.00448608398438, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.06691989245017285, "grad_norm": 0.022233830764889717, "kl": 0.012385845184326172, "learning_rate": 9.531150643965224e-06, "loss": -0.0391, "num_tokens": 69735454.0, "reward": 0.45047202706336975, "reward_std": 0.0631929263472557, "rewards/gemini_judge_reward_func/mean": 0.1350446492433548, "rewards/gemini_judge_reward_func/std": 0.21152234077453613, "rewards/semantic_correctness_reward_func/mean": 0.42714568972587585, "rewards/semantic_correctness_reward_func/std": 0.1992858350276947, "rewards/xmlcount_reward_func/mean": 0.7775625586509705, "rewards/xmlcount_reward_func/std": 0.4177508056163788, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 163.46429443359375, "completions/mean_terminated_length": 147.81817626953125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0672613204728778, "grad_norm": 0.02109030820429325, "kl": 0.013692617416381836, "learning_rate": 9.524135262330098e-06, "loss": -0.0261, "num_tokens": 70092946.0, "reward": 0.4639114439487457, "reward_std": 0.08710245043039322, "rewards/gemini_judge_reward_func/mean": 0.1651785671710968, "rewards/gemini_judge_reward_func/std": 0.2721233069896698, "rewards/semantic_correctness_reward_func/mean": 0.4921642243862152, "rewards/semantic_correctness_reward_func/std": 0.19522406160831451, "rewards/xmlcount_reward_func/mean": 0.7485179305076599, "rewards/xmlcount_reward_func/std": 0.4344094395637512, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 155.98214721679688, "completions/mean_terminated_length": 140.1999969482422, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.06760274849558277, "grad_norm": 0.02615966647863388, "kl": 0.017928600311279297, "learning_rate": 9.517070405476575e-06, "loss": -0.0153, "num_tokens": 70460834.0, "reward": 0.41672831773757935, "reward_std": 0.06112901121377945, "rewards/gemini_judge_reward_func/mean": 0.1417410671710968, "rewards/gemini_judge_reward_func/std": 0.23377598822116852, "rewards/semantic_correctness_reward_func/mean": 0.44165927171707153, "rewards/semantic_correctness_reward_func/std": 0.20446263253688812, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 166.02232360839844, "completions/mean_terminated_length": 154.37557983398438, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.06794417651828774, "grad_norm": 0.021424876525998116, "kl": 0.014705181121826172, "learning_rate": 9.509956150664796e-06, "loss": -0.002, "num_tokens": 70813555.0, "reward": 0.4322836995124817, "reward_std": 0.06650731712579727, "rewards/gemini_judge_reward_func/mean": 0.1116071417927742, "rewards/gemini_judge_reward_func/std": 0.22669215500354767, "rewards/semantic_correctness_reward_func/mean": 0.4367040693759918, "rewards/semantic_correctness_reward_func/std": 0.20320431888103485, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 173.4553680419922, "completions/mean_terminated_length": 161.90951538085938, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.06828560454099271, "grad_norm": 0.02211941033601761, "kl": 0.014380931854248047, "learning_rate": 9.502792575695112e-06, "loss": -0.0222, "num_tokens": 71159069.0, "reward": 0.4176250398159027, "reward_std": 0.05642692372202873, "rewards/gemini_judge_reward_func/mean": 0.1160714253783226, "rewards/gemini_judge_reward_func/std": 0.2219172716140747, "rewards/semantic_correctness_reward_func/mean": 0.425982266664505, "rewards/semantic_correctness_reward_func/std": 0.18948838114738464, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 158.24554443359375, "completions/mean_terminated_length": 158.24554443359375, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.06862703256369766, "grad_norm": 0.021084995940327644, "kl": 0.012256860733032227, "learning_rate": 9.495579758907231e-06, "loss": -0.0282, "num_tokens": 71504732.0, "reward": 0.47500061988830566, "reward_std": 0.06313009560108185, "rewards/gemini_judge_reward_func/mean": 0.1328125, "rewards/gemini_judge_reward_func/std": 0.22941070795059204, "rewards/semantic_correctness_reward_func/mean": 0.4291277825832367, "rewards/semantic_correctness_reward_func/std": 0.20796488225460052, "rewards/xmlcount_reward_func/mean": 0.8401250839233398, "rewards/xmlcount_reward_func/std": 0.3684578537940979, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 164.35714721679688, "completions/mean_terminated_length": 152.6877899169922, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.06896846058640263, "grad_norm": 0.02392446994781494, "kl": 0.014485836029052734, "learning_rate": 9.48831777917936e-06, "loss": -0.0129, "num_tokens": 71864228.0, "reward": 0.458452969789505, "reward_std": 0.0712665319442749, "rewards/gemini_judge_reward_func/mean": 0.1238839253783226, "rewards/gemini_judge_reward_func/std": 0.23075933754444122, "rewards/semantic_correctness_reward_func/mean": 0.44468429684638977, "rewards/semantic_correctness_reward_func/std": 0.21000932157039642, "rewards/xmlcount_reward_func/mean": 0.799906313419342, "rewards/xmlcount_reward_func/std": 0.40196701884269714, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 155.2232208251953, "completions/mean_terminated_length": 151.32736206054688, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0693098886091076, "grad_norm": 0.021716050803661346, "kl": 0.014530658721923828, "learning_rate": 9.481006715927352e-06, "loss": -0.0081, "num_tokens": 72209762.0, "reward": 0.43672648072242737, "reward_std": 0.051114633679389954, "rewards/gemini_judge_reward_func/mean": 0.1104910746216774, "rewards/gemini_judge_reward_func/std": 0.24901461601257324, "rewards/semantic_correctness_reward_func/mean": 0.42540010809898376, "rewards/semantic_correctness_reward_func/std": 0.20755036175251007, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 164.58482360839844, "completions/mean_terminated_length": 148.95909118652344, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.06965131663181255, "grad_norm": 0.023567847907543182, "kl": 0.0158233642578125, "learning_rate": 9.473646649103819e-06, "loss": -0.0073, "num_tokens": 72559321.0, "reward": 0.3971547782421112, "reward_std": 0.06843477487564087, "rewards/gemini_judge_reward_func/mean": 0.0848214253783226, "rewards/gemini_judge_reward_func/std": 0.19823653995990753, "rewards/semantic_correctness_reward_func/mean": 0.37718465924263, "rewards/semantic_correctness_reward_func/std": 0.20006071031093597, "rewards/xmlcount_reward_func/mean": 0.7194733023643494, "rewards/xmlcount_reward_func/std": 0.44856736063957214, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 165.51339721679688, "completions/mean_terminated_length": 157.77928161621094, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.06999274465451752, "grad_norm": 0.02119120955467224, "kl": 0.014370441436767578, "learning_rate": 9.466237659197271e-06, "loss": 0.0142, "num_tokens": 72915740.0, "reward": 0.4165930151939392, "reward_std": 0.05669743940234184, "rewards/gemini_judge_reward_func/mean": 0.1071428582072258, "rewards/gemini_judge_reward_func/std": 0.22389310598373413, "rewards/semantic_correctness_reward_func/mean": 0.40292924642562866, "rewards/semantic_correctness_reward_func/std": 0.19625583291053772, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 161.34375, "completions/mean_terminated_length": 145.65908813476562, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.07033417267722249, "grad_norm": 0.02108367159962654, "kl": 0.017071247100830078, "learning_rate": 9.458779827231237e-06, "loss": -0.0316, "num_tokens": 73288101.0, "reward": 0.4331299364566803, "reward_std": 0.06814208626747131, "rewards/gemini_judge_reward_func/mean": 0.1607142835855484, "rewards/gemini_judge_reward_func/std": 0.28579434752464294, "rewards/semantic_correctness_reward_func/mean": 0.44997096061706543, "rewards/semantic_correctness_reward_func/std": 0.19737227261066437, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 172.92857360839844, "completions/mean_terminated_length": 149.50457763671875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.07067560069992744, "grad_norm": 0.023088613525032997, "kl": 0.015703678131103516, "learning_rate": 9.451273234763372e-06, "loss": 0.0331, "num_tokens": 73661089.0, "reward": 0.4168475568294525, "reward_std": 0.05705378204584122, "rewards/gemini_judge_reward_func/mean": 0.1350446492433548, "rewards/gemini_judge_reward_func/std": 0.27490234375, "rewards/semantic_correctness_reward_func/mean": 0.4198981821537018, "rewards/semantic_correctness_reward_func/std": 0.19315725564956665, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 161.10714721679688, "completions/mean_terminated_length": 157.2376708984375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.07101702872263241, "grad_norm": 0.022637007758021355, "kl": 0.01239013671875, "learning_rate": 9.443717963884568e-06, "loss": -0.0349, "num_tokens": 74021557.0, "reward": 0.4410480856895447, "reward_std": 0.06117826700210571, "rewards/gemini_judge_reward_func/mean": 0.1439732164144516, "rewards/gemini_judge_reward_func/std": 0.2487129420042038, "rewards/semantic_correctness_reward_func/mean": 0.41579389572143555, "rewards/semantic_correctness_reward_func/std": 0.1910022795200348, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 163.0982208251953, "completions/mean_terminated_length": 155.34234619140625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.07135845674533738, "grad_norm": 0.021986160427331924, "kl": 0.01541757583618164, "learning_rate": 9.43611409721806e-06, "loss": -0.0005, "num_tokens": 74362631.0, "reward": 0.4239736497402191, "reward_std": 0.053107425570487976, "rewards/gemini_judge_reward_func/mean": 0.1149553582072258, "rewards/gemini_judge_reward_func/std": 0.249235600233078, "rewards/semantic_correctness_reward_func/mean": 0.38845744729042053, "rewards/semantic_correctness_reward_func/std": 0.2081860601902008, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 182.16964721679688, "completions/mean_terminated_length": 159.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.07169988476804234, "grad_norm": 0.020678477361798286, "kl": 0.01442861557006836, "learning_rate": 9.428461717918512e-06, "loss": -0.0257, "num_tokens": 74725897.0, "reward": 0.4402584433555603, "reward_std": 0.06913614273071289, "rewards/gemini_judge_reward_func/mean": 0.140625, "rewards/gemini_judge_reward_func/std": 0.24093718826770782, "rewards/semantic_correctness_reward_func/mean": 0.45429208874702454, "rewards/semantic_correctness_reward_func/std": 0.22150787711143494, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 156.3125, "completions/mean_terminated_length": 144.533935546875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.0720413127907473, "grad_norm": 0.02135198749601841, "kl": 0.014465570449829102, "learning_rate": 9.420760909671119e-06, "loss": -0.0045, "num_tokens": 75080187.0, "reward": 0.3945431113243103, "reward_std": 0.060559362173080444, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.26146823167800903, "rewards/semantic_correctness_reward_func/mean": 0.408893883228302, "rewards/semantic_correctness_reward_func/std": 0.1875745952129364, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 164.40179443359375, "completions/mean_terminated_length": 144.7762451171875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.07238274081345226, "grad_norm": 0.02135198749601841, "kl": 0.01694631576538086, "learning_rate": 9.420760909671119e-06, "loss": -0.0253, "num_tokens": 75444185.0, "reward": 0.4022166430950165, "reward_std": 0.05362841486930847, "rewards/gemini_judge_reward_func/mean": 0.1361607164144516, "rewards/gemini_judge_reward_func/std": 0.23767462372779846, "rewards/semantic_correctness_reward_func/mean": 0.4517615735530853, "rewards/semantic_correctness_reward_func/std": 0.20318275690078735, "rewards/xmlcount_reward_func/mean": 0.643500030040741, "rewards/xmlcount_reward_func/std": 0.48071083426475525, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 159.13839721679688, "completions/mean_terminated_length": 147.398193359375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.07272416883615723, "grad_norm": 0.021344488486647606, "kl": 0.015068531036376953, "learning_rate": 9.413011756690686e-06, "loss": -0.0179, "num_tokens": 75804152.0, "reward": 0.4192578196525574, "reward_std": 0.056778065860271454, "rewards/gemini_judge_reward_func/mean": 0.1037946417927742, "rewards/gemini_judge_reward_func/std": 0.22484947741031647, "rewards/semantic_correctness_reward_func/mean": 0.42294973134994507, "rewards/semantic_correctness_reward_func/std": 0.22244691848754883, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 144.6741180419922, "completions/mean_terminated_length": 144.6741180419922, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.07306559685886219, "grad_norm": 0.02157749980688095, "kl": 0.014754772186279297, "learning_rate": 9.405214343720708e-06, "loss": -0.0117, "num_tokens": 76156359.0, "reward": 0.42160260677337646, "reward_std": 0.054873332381248474, "rewards/gemini_judge_reward_func/mean": 0.0848214253783226, "rewards/gemini_judge_reward_func/std": 0.19249825179576874, "rewards/semantic_correctness_reward_func/mean": 0.4081380069255829, "rewards/semantic_correctness_reward_func/std": 0.1861819475889206, "rewards/xmlcount_reward_func/mean": 0.7651161551475525, "rewards/xmlcount_reward_func/std": 0.4228045344352722, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 170.70089721679688, "completions/mean_terminated_length": 155.1863555908203, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.07340702488156715, "grad_norm": 0.02151479572057724, "kl": 0.014784812927246094, "learning_rate": 9.397368756032445e-06, "loss": -0.0108, "num_tokens": 76514364.0, "reward": 0.42958998680114746, "reward_std": 0.058609623461961746, "rewards/gemini_judge_reward_func/mean": 0.1160714253783226, "rewards/gemini_judge_reward_func/std": 0.2244289368391037, "rewards/semantic_correctness_reward_func/mean": 0.41548532247543335, "rewards/semantic_correctness_reward_func/std": 0.2050415575504303, "rewards/xmlcount_reward_func/mean": 0.7501607537269592, "rewards/xmlcount_reward_func/std": 0.4322551190853119, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 160.38394165039062, "completions/mean_terminated_length": 156.51121520996094, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.07374845290427212, "grad_norm": 0.02160024829208851, "kl": 0.015204429626464844, "learning_rate": 9.389475079423988e-06, "loss": 0.0052, "num_tokens": 76871318.0, "reward": 0.40496453642845154, "reward_std": 0.06893553584814072, "rewards/gemini_judge_reward_func/mean": 0.1339285671710968, "rewards/gemini_judge_reward_func/std": 0.25151684880256653, "rewards/semantic_correctness_reward_func/mean": 0.4342152774333954, "rewards/semantic_correctness_reward_func/std": 0.20986708998680115, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 152.30804443359375, "completions/mean_terminated_length": 148.39910888671875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.07408988092697708, "grad_norm": 0.02134779281914234, "kl": 0.01924419403076172, "learning_rate": 9.381533400219319e-06, "loss": 0.0234, "num_tokens": 77235119.0, "reward": 0.47121262550354004, "reward_std": 0.07921778410673141, "rewards/gemini_judge_reward_func/mean": 0.1674107164144516, "rewards/gemini_judge_reward_func/std": 0.27890798449516296, "rewards/semantic_correctness_reward_func/mean": 0.4839913547039032, "rewards/semantic_correctness_reward_func/std": 0.22796045243740082, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 156.70089721679688, "completions/mean_terminated_length": 152.81166076660156, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.07443130894968204, "grad_norm": 0.021481551229953766, "kl": 0.013187408447265625, "learning_rate": 9.373543805267367e-06, "loss": -0.0126, "num_tokens": 77561024.0, "reward": 0.4687114655971527, "reward_std": 0.06550662219524384, "rewards/gemini_judge_reward_func/mean": 0.1227678582072258, "rewards/gemini_judge_reward_func/std": 0.21306942403316498, "rewards/semantic_correctness_reward_func/mean": 0.4535214602947235, "rewards/semantic_correctness_reward_func/std": 0.21218258142471313, "rewards/xmlcount_reward_func/mean": 0.8222500681877136, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 170.36607360839844, "completions/mean_terminated_length": 150.876708984375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.07477273697238701, "grad_norm": 0.020490756258368492, "kl": 0.013939857482910156, "learning_rate": 9.365506381941066e-06, "loss": 0.0014, "num_tokens": 77917414.0, "reward": 0.4274297058582306, "reward_std": 0.06760236620903015, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.2560526132583618, "rewards/semantic_correctness_reward_func/mean": 0.44821077585220337, "rewards/semantic_correctness_reward_func/std": 0.21363292634487152, "rewards/xmlcount_reward_func/mean": 0.7239331007003784, "rewards/xmlcount_reward_func/std": 0.4488573372364044, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 163.45982360839844, "completions/mean_terminated_length": 147.81362915039062, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.07511416499509198, "grad_norm": 0.021693740040063858, "kl": 0.016297340393066406, "learning_rate": 9.357421218136387e-06, "loss": -0.023, "num_tokens": 78286709.0, "reward": 0.4039275050163269, "reward_std": 0.06309369206428528, "rewards/gemini_judge_reward_func/mean": 0.1026785746216774, "rewards/gemini_judge_reward_func/std": 0.22474093735218048, "rewards/semantic_correctness_reward_func/mean": 0.411083847284317, "rewards/semantic_correctness_reward_func/std": 0.20537878572940826, "rewards/xmlcount_reward_func/mean": 0.7015982270240784, "rewards/xmlcount_reward_func/std": 0.4568972587585449, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 161.10269165039062, "completions/mean_terminated_length": 149.38914489746094, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.07545559301779693, "grad_norm": 0.021494340151548386, "kl": 0.014636516571044922, "learning_rate": 9.349288402271387e-06, "loss": 0.0103, "num_tokens": 78654304.0, "reward": 0.40833523869514465, "reward_std": 0.06033541262149811, "rewards/gemini_judge_reward_func/mean": 0.1194196417927742, "rewards/gemini_judge_reward_func/std": 0.24483934044837952, "rewards/semantic_correctness_reward_func/mean": 0.4185151755809784, "rewards/semantic_correctness_reward_func/std": 0.18770398199558258, "rewards/xmlcount_reward_func/mean": 0.6921607255935669, "rewards/xmlcount_reward_func/std": 0.46012961864471436, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 163.02679443359375, "completions/mean_terminated_length": 155.27027893066406, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.0757970210405019, "grad_norm": 0.021399740129709244, "kl": 0.015057563781738281, "learning_rate": 9.341108023285239e-06, "loss": -0.0114, "num_tokens": 79000814.0, "reward": 0.45225730538368225, "reward_std": 0.07958463579416275, "rewards/gemini_judge_reward_func/mean": 0.1506696492433548, "rewards/gemini_judge_reward_func/std": 0.25258663296699524, "rewards/semantic_correctness_reward_func/mean": 0.43167024850845337, "rewards/semantic_correctness_reward_func/std": 0.20277100801467896, "rewards/xmlcount_reward_func/mean": 0.764138400554657, "rewards/xmlcount_reward_func/std": 0.42636701464653015, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 152.04019165039062, "completions/mean_terminated_length": 148.1300506591797, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.07613844906320687, "grad_norm": 0.023108750581741333, "kl": 0.013528823852539062, "learning_rate": 9.332880170637252e-06, "loss": -0.019, "num_tokens": 79331523.0, "reward": 0.412973016500473, "reward_std": 0.04944463074207306, "rewards/gemini_judge_reward_func/mean": 0.0948660746216774, "rewards/gemini_judge_reward_func/std": 0.18995627760887146, "rewards/semantic_correctness_reward_func/mean": 0.4093826711177826, "rewards/semantic_correctness_reward_func/std": 0.19295699894428253, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 162.71429443359375, "completions/mean_terminated_length": 147.05453491210938, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.07647987708591182, "grad_norm": 0.0203098151832819, "kl": 0.015305519104003906, "learning_rate": 9.324604934305911e-06, "loss": 0.0189, "num_tokens": 79697987.0, "reward": 0.40999144315719604, "reward_std": 0.05599633976817131, "rewards/gemini_judge_reward_func/mean": 0.0870535746216774, "rewards/gemini_judge_reward_func/std": 0.18250198662281036, "rewards/semantic_correctness_reward_func/mean": 0.4190196692943573, "rewards/semantic_correctness_reward_func/std": 0.19571587443351746, "rewards/xmlcount_reward_func/mean": 0.7284151911735535, "rewards/xmlcount_reward_func/std": 0.438982218503952, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 154.38394165039062, "completions/mean_terminated_length": 146.549560546875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.07682130510861679, "grad_norm": 0.02163609303534031, "kl": 0.01602935791015625, "learning_rate": 9.31628240478787e-06, "loss": -0.0308, "num_tokens": 80051057.0, "reward": 0.43514472246170044, "reward_std": 0.06055814400315285, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.25388360023498535, "rewards/semantic_correctness_reward_func/mean": 0.41975903511047363, "rewards/semantic_correctness_reward_func/std": 0.186967134475708, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 184.8303680419922, "completions/mean_terminated_length": 165.6712188720703, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.07716273313132176, "grad_norm": 0.02016390860080719, "kl": 0.016954421997070312, "learning_rate": 9.30791267309698e-06, "loss": -0.0333, "num_tokens": 80438571.0, "reward": 0.4161798655986786, "reward_std": 0.058609530329704285, "rewards/gemini_judge_reward_func/mean": 0.1171875, "rewards/gemini_judge_reward_func/std": 0.2168499380350113, "rewards/semantic_correctness_reward_func/mean": 0.461211621761322, "rewards/semantic_correctness_reward_func/std": 0.2113778293132782, "rewards/xmlcount_reward_func/mean": 0.6926562190055847, "rewards/xmlcount_reward_func/std": 0.4631781280040741, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 165.24554443359375, "completions/mean_terminated_length": 145.63926696777344, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.07750416115402671, "grad_norm": 0.02275063470005989, "kl": 0.015069961547851562, "learning_rate": 9.299495830763285e-06, "loss": -0.0236, "num_tokens": 80815254.0, "reward": 0.42212775349617004, "reward_std": 0.074894979596138, "rewards/gemini_judge_reward_func/mean": 0.1607142835855484, "rewards/gemini_judge_reward_func/std": 0.2877489924430847, "rewards/semantic_correctness_reward_func/mean": 0.4664600193500519, "rewards/semantic_correctness_reward_func/std": 0.22998718917369843, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500184178352356, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 171.2678680419922, "completions/mean_terminated_length": 155.7636260986328, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.07784558917673168, "grad_norm": 0.02133053168654442, "kl": 0.01563549041748047, "learning_rate": 9.291031969832026e-06, "loss": -0.0118, "num_tokens": 81154166.0, "reward": 0.4396763741970062, "reward_std": 0.0586925633251667, "rewards/gemini_judge_reward_func/mean": 0.0837053582072258, "rewards/gemini_judge_reward_func/std": 0.1878366768360138, "rewards/semantic_correctness_reward_func/mean": 0.42222103476524353, "rewards/semantic_correctness_reward_func/std": 0.2082173377275467, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 160.46429443359375, "completions/mean_terminated_length": 156.59193420410156, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.07818701719943665, "grad_norm": 0.020135052502155304, "kl": 0.014910221099853516, "learning_rate": 9.28252118286263e-06, "loss": -0.0066, "num_tokens": 81518406.0, "reward": 0.3965265154838562, "reward_std": 0.05617070570588112, "rewards/gemini_judge_reward_func/mean": 0.1138392835855484, "rewards/gemini_judge_reward_func/std": 0.22432857751846313, "rewards/semantic_correctness_reward_func/mean": 0.43941816687583923, "rewards/semantic_correctness_reward_func/std": 0.19124464690685272, "rewards/xmlcount_reward_func/mean": 0.6577678322792053, "rewards/xmlcount_reward_func/std": 0.47394242882728577, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 168.90179443359375, "completions/mean_terminated_length": 149.3789825439453, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.07852844522214161, "grad_norm": 0.021697277203202248, "kl": 0.013443470001220703, "learning_rate": 9.273963562927695e-06, "loss": 0.007, "num_tokens": 81881684.0, "reward": 0.44313669204711914, "reward_std": 0.06279011070728302, "rewards/gemini_judge_reward_func/mean": 0.1316964328289032, "rewards/gemini_judge_reward_func/std": 0.2482219636440277, "rewards/semantic_correctness_reward_func/mean": 0.4566476047039032, "rewards/semantic_correctness_reward_func/std": 0.18283917009830475, "rewards/xmlcount_reward_func/mean": 0.7478214502334595, "rewards/xmlcount_reward_func/std": 0.43387457728385925, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 167.49554443359375, "completions/mean_terminated_length": 147.9406280517578, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.07886987324484657, "grad_norm": 0.020851455628871918, "kl": 0.014322280883789062, "learning_rate": 9.265359203611988e-06, "loss": -0.0232, "num_tokens": 82245263.0, "reward": 0.4470902383327484, "reward_std": 0.06255059689283371, "rewards/gemini_judge_reward_func/mean": 0.1127232164144516, "rewards/gemini_judge_reward_func/std": 0.220489040017128, "rewards/semantic_correctness_reward_func/mean": 0.43700453639030457, "rewards/semantic_correctness_reward_func/std": 0.2078365534543991, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 175.83482360839844, "completions/mean_terminated_length": 152.49081420898438, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.07921130126755153, "grad_norm": 0.02140904776751995, "kl": 0.020357608795166016, "learning_rate": 9.256708199011402e-06, "loss": -0.0233, "num_tokens": 82588374.0, "reward": 0.38504737615585327, "reward_std": 0.07159604132175446, "rewards/gemini_judge_reward_func/mean": 0.1060267835855484, "rewards/gemini_judge_reward_func/std": 0.2383553832769394, "rewards/semantic_correctness_reward_func/mean": 0.3775940239429474, "rewards/semantic_correctness_reward_func/std": 0.18888156116008759, "rewards/xmlcount_reward_func/mean": 0.6677946448326111, "rewards/xmlcount_reward_func/std": 0.468395471572876, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 159.8169708251953, "completions/mean_terminated_length": 144.1045379638672, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0795527292902565, "grad_norm": 0.021409912034869194, "kl": 0.015997886657714844, "learning_rate": 9.248010643731936e-06, "loss": -0.0371, "num_tokens": 82966821.0, "reward": 0.40062811970710754, "reward_std": 0.0587112195789814, "rewards/gemini_judge_reward_func/mean": 0.09375, "rewards/gemini_judge_reward_func/std": 0.2012433260679245, "rewards/semantic_correctness_reward_func/mean": 0.411104679107666, "rewards/semantic_correctness_reward_func/std": 0.20091889798641205, "rewards/xmlcount_reward_func/mean": 0.7022679448127747, "rewards/xmlcount_reward_func/std": 0.45671790838241577, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 165.28125, "completions/mean_terminated_length": 145.67579650878906, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.07989415731296146, "grad_norm": 0.02124546654522419, "kl": 0.014566898345947266, "learning_rate": 9.23926663288866e-06, "loss": -0.0139, "num_tokens": 83334656.0, "reward": 0.39910420775413513, "reward_std": 0.06146840751171112, "rewards/gemini_judge_reward_func/mean": 0.1238839253783226, "rewards/gemini_judge_reward_func/std": 0.2538909912109375, "rewards/semantic_correctness_reward_func/mean": 0.43394067883491516, "rewards/semantic_correctness_reward_func/std": 0.226360023021698, "rewards/xmlcount_reward_func/mean": 0.656906247138977, "rewards/xmlcount_reward_func/std": 0.47649866342544556, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 151.41519165039062, "completions/mean_terminated_length": 139.5701446533203, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.08023558533566642, "grad_norm": 0.022864429280161858, "kl": 0.016307353973388672, "learning_rate": 9.230476262104678e-06, "loss": -0.0324, "num_tokens": 83697741.0, "reward": 0.40850719809532166, "reward_std": 0.06750909239053726, "rewards/gemini_judge_reward_func/mean": 0.1294642835855484, "rewards/gemini_judge_reward_func/std": 0.22078678011894226, "rewards/semantic_correctness_reward_func/mean": 0.42064300179481506, "rewards/semantic_correctness_reward_func/std": 0.20097774267196655, "rewards/xmlcount_reward_func/mean": 0.681482195854187, "rewards/xmlcount_reward_func/std": 0.4664749205112457, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0401785714285714, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 185.51339721679688, "completions/mean_terminated_length": 150.41395568847656, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.08057701335837139, "grad_norm": 0.021365147083997726, "kl": 0.013797760009765625, "learning_rate": 9.221639627510076e-06, "loss": -0.0213, "num_tokens": 84090224.0, "reward": 0.3859476149082184, "reward_std": 0.06201120465993881, "rewards/gemini_judge_reward_func/mean": 0.1037946417927742, "rewards/gemini_judge_reward_func/std": 0.21724192798137665, "rewards/semantic_correctness_reward_func/mean": 0.4172736704349518, "rewards/semantic_correctness_reward_func/std": 0.20381608605384827, "rewards/xmlcount_reward_func/mean": 0.6524375677108765, "rewards/xmlcount_reward_func/std": 0.47794878482818604, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 153.1428680419922, "completions/mean_terminated_length": 145.29730224609375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.08091844138107636, "grad_norm": 0.021086620166897774, "kl": 0.014497756958007812, "learning_rate": 9.212756825740874e-06, "loss": -0.0396, "num_tokens": 84447016.0, "reward": 0.434138685464859, "reward_std": 0.07237514853477478, "rewards/gemini_judge_reward_func/mean": 0.1171875, "rewards/gemini_judge_reward_func/std": 0.24015435576438904, "rewards/semantic_correctness_reward_func/mean": 0.3990683853626251, "rewards/semantic_correctness_reward_func/std": 0.18283356726169586, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 163.5625, "completions/mean_terminated_length": 151.88235473632812, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.08125986940378131, "grad_norm": 0.02173583023250103, "kl": 0.01507568359375, "learning_rate": 9.203827953937969e-06, "loss": 0.0123, "num_tokens": 84820046.0, "reward": 0.40177983045578003, "reward_std": 0.06776406615972519, "rewards/gemini_judge_reward_func/mean": 0.1071428582072258, "rewards/gemini_judge_reward_func/std": 0.2162519097328186, "rewards/semantic_correctness_reward_func/mean": 0.40036338567733765, "rewards/semantic_correctness_reward_func/std": 0.22206875681877136, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 163.17857360839844, "completions/mean_terminated_length": 147.5272674560547, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.08160129742648628, "grad_norm": 0.02061956562101841, "kl": 0.019533157348632812, "learning_rate": 9.194853109746073e-06, "loss": -0.0204, "num_tokens": 85190930.0, "reward": 0.354619562625885, "reward_std": 0.0518239289522171, "rewards/gemini_judge_reward_func/mean": 0.0736607164144516, "rewards/gemini_judge_reward_func/std": 0.16450220346450806, "rewards/semantic_correctness_reward_func/mean": 0.3745262324810028, "rewards/semantic_correctness_reward_func/std": 0.1803187131881714, "rewards/xmlcount_reward_func/mean": 0.6256250739097595, "rewards/xmlcount_reward_func/std": 0.48569241166114807, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 163.87054443359375, "completions/mean_terminated_length": 156.1216278076172, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.08194272544919125, "grad_norm": 0.02133871614933014, "kl": 0.0135650634765625, "learning_rate": 9.185832391312644e-06, "loss": -0.0231, "num_tokens": 85531313.0, "reward": 0.4272550642490387, "reward_std": 0.06283900886774063, "rewards/gemini_judge_reward_func/mean": 0.0870535746216774, "rewards/gemini_judge_reward_func/std": 0.20965971052646637, "rewards/semantic_correctness_reward_func/mean": 0.38916799426078796, "rewards/semantic_correctness_reward_func/std": 0.1963030993938446, "rewards/xmlcount_reward_func/mean": 0.7864999771118164, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 163.3616180419922, "completions/mean_terminated_length": 155.6081085205078, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.0822841534718962, "grad_norm": 0.01960454136133194, "kl": 0.013911247253417969, "learning_rate": 9.176765897286812e-06, "loss": -0.0249, "num_tokens": 85890730.0, "reward": 0.4563038647174835, "reward_std": 0.06880877912044525, "rewards/gemini_judge_reward_func/mean": 0.1551339328289032, "rewards/gemini_judge_reward_func/std": 0.2734237015247345, "rewards/semantic_correctness_reward_func/mean": 0.4544209837913513, "rewards/semantic_correctness_reward_func/std": 0.2353294938802719, "rewards/xmlcount_reward_func/mean": 0.7584152221679688, "rewards/xmlcount_reward_func/std": 0.42483606934547424, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 178.0982208251953, "completions/mean_terminated_length": 146.76852416992188, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.08262558149460117, "grad_norm": 0.024442723020911217, "kl": 0.017345428466796875, "learning_rate": 9.167653726818305e-06, "loss": -0.009, "num_tokens": 86252752.0, "reward": 0.43084582686424255, "reward_std": 0.07760713994503021, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.24142225086688995, "rewards/semantic_correctness_reward_func/mean": 0.4139074683189392, "rewards/semantic_correctness_reward_func/std": 0.230632945895195, "rewards/xmlcount_reward_func/mean": 0.7418125867843628, "rewards/xmlcount_reward_func/std": 0.4394664168357849, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 147.71429443359375, "completions/mean_terminated_length": 139.81982421875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.08296700951730614, "grad_norm": 0.02121170423924923, "kl": 0.014449596405029297, "learning_rate": 9.15849597955636e-06, "loss": -0.0202, "num_tokens": 86610004.0, "reward": 0.4123784005641937, "reward_std": 0.06092296540737152, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.2668020725250244, "rewards/semantic_correctness_reward_func/mean": 0.413177490234375, "rewards/semantic_correctness_reward_func/std": 0.19070591032505035, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 158.5178680419922, "completions/mean_terminated_length": 150.72071838378906, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.08330843754001109, "grad_norm": 0.02672337368130684, "kl": 0.01831817626953125, "learning_rate": 9.149292755648631e-06, "loss": -0.0392, "num_tokens": 86964124.0, "reward": 0.4503079354763031, "reward_std": 0.08422276377677917, "rewards/gemini_judge_reward_func/mean": 0.1618303507566452, "rewards/gemini_judge_reward_func/std": 0.26949670910835266, "rewards/semantic_correctness_reward_func/mean": 0.4621289074420929, "rewards/semantic_correctness_reward_func/std": 0.21201936900615692, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 172.85269165039062, "completions/mean_terminated_length": 145.3963165283203, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.08364986556271606, "grad_norm": 0.02006138674914837, "kl": 0.01513051986694336, "learning_rate": 9.140044155740102e-06, "loss": 0.0065, "num_tokens": 87345575.0, "reward": 0.36606886982917786, "reward_std": 0.058657389134168625, "rewards/gemini_judge_reward_func/mean": 0.09375, "rewards/gemini_judge_reward_func/std": 0.20809029042720795, "rewards/semantic_correctness_reward_func/mean": 0.42964765429496765, "rewards/semantic_correctness_reward_func/std": 0.19854861497879028, "rewards/xmlcount_reward_func/mean": 0.6065982580184937, "rewards/xmlcount_reward_func/std": 0.4893246293067932, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 151.75894165039062, "completions/mean_terminated_length": 143.90090942382812, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.08399129358542103, "grad_norm": 0.022956129163503647, "kl": 0.01905345916748047, "learning_rate": 9.130750280971978e-06, "loss": -0.0355, "num_tokens": 87729765.0, "reward": 0.392170250415802, "reward_std": 0.06233404949307442, "rewards/gemini_judge_reward_func/mean": 0.1138392835855484, "rewards/gemini_judge_reward_func/std": 0.24464760720729828, "rewards/semantic_correctness_reward_func/mean": 0.3992529511451721, "rewards/semantic_correctness_reward_func/std": 0.21186232566833496, "rewards/xmlcount_reward_func/mean": 0.666959822177887, "rewards/xmlcount_reward_func/std": 0.47216716408729553, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 165.2232208251953, "completions/mean_terminated_length": 161.37220764160156, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.084332721608126, "grad_norm": 0.01989656873047352, "kl": 0.01374053955078125, "learning_rate": 9.121411232980589e-06, "loss": -0.0314, "num_tokens": 88087375.0, "reward": 0.4254680275917053, "reward_std": 0.0688747987151146, "rewards/gemini_judge_reward_func/mean": 0.1328125, "rewards/gemini_judge_reward_func/std": 0.26031482219696045, "rewards/semantic_correctness_reward_func/mean": 0.41387563943862915, "rewards/semantic_correctness_reward_func/std": 0.21328496932983398, "rewards/xmlcount_reward_func/mean": 0.723919689655304, "rewards/xmlcount_reward_func/std": 0.4488491117954254, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 165.77679443359375, "completions/mean_terminated_length": 150.1727294921875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.08467414963083095, "grad_norm": 0.021479859948158264, "kl": 0.015668630599975586, "learning_rate": 9.112027113896262e-06, "loss": -0.0151, "num_tokens": 88441945.0, "reward": 0.4406365156173706, "reward_std": 0.0737806111574173, "rewards/gemini_judge_reward_func/mean": 0.1305803507566452, "rewards/gemini_judge_reward_func/std": 0.24369193613529205, "rewards/semantic_correctness_reward_func/mean": 0.4360485374927521, "rewards/semantic_correctness_reward_func/std": 0.21203678846359253, "rewards/xmlcount_reward_func/mean": 0.7529866099357605, "rewards/xmlcount_reward_func/std": 0.4318158030509949, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 170.93304443359375, "completions/mean_terminated_length": 151.4566192626953, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.08501557765353591, "grad_norm": 0.020318983122706413, "kl": 0.015210866928100586, "learning_rate": 9.102598026342223e-06, "loss": -0.0411, "num_tokens": 88798298.0, "reward": 0.4054628312587738, "reward_std": 0.06705626100301743, "rewards/gemini_judge_reward_func/mean": 0.1104910746216774, "rewards/gemini_judge_reward_func/std": 0.24100728332996368, "rewards/semantic_correctness_reward_func/mean": 0.37633195519447327, "rewards/semantic_correctness_reward_func/std": 0.21562041342258453, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 162.27679443359375, "completions/mean_terminated_length": 150.57919311523438, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.08535700567624088, "grad_norm": 0.02207431197166443, "kl": 0.01397848129272461, "learning_rate": 9.093124073433464e-06, "loss": -0.0073, "num_tokens": 89144892.0, "reward": 0.42173489928245544, "reward_std": 0.053448598831892014, "rewards/gemini_judge_reward_func/mean": 0.109375, "rewards/gemini_judge_reward_func/std": 0.21771657466888428, "rewards/semantic_correctness_reward_func/mean": 0.42417430877685547, "rewards/semantic_correctness_reward_func/std": 0.19368620216846466, "rewards/xmlcount_reward_func/mean": 0.7328750491142273, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 158.2366180419922, "completions/mean_terminated_length": 154.35426330566406, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.08569843369894584, "grad_norm": 0.021455751731991768, "kl": 0.015799522399902344, "learning_rate": 9.083605358775612e-06, "loss": -0.0366, "num_tokens": 89481561.0, "reward": 0.44944095611572266, "reward_std": 0.06190529465675354, "rewards/gemini_judge_reward_func/mean": 0.1104910746216774, "rewards/gemini_judge_reward_func/std": 0.22538302838802338, "rewards/semantic_correctness_reward_func/mean": 0.4174725115299225, "rewards/semantic_correctness_reward_func/std": 0.19083231687545776, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 175.04464721679688, "completions/mean_terminated_length": 151.67889404296875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.0860398617216508, "grad_norm": 0.021432137116789818, "kl": 0.015732288360595703, "learning_rate": 9.074041986463808e-06, "loss": -0.0306, "num_tokens": 89851651.0, "reward": 0.40240949392318726, "reward_std": 0.06158284842967987, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.2527475655078888, "rewards/semantic_correctness_reward_func/mean": 0.3960294723510742, "rewards/semantic_correctness_reward_func/std": 0.21675321459770203, "rewards/xmlcount_reward_func/mean": 0.6874732971191406, "rewards/xmlcount_reward_func/std": 0.46465516090393066, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 174.13839721679688, "completions/mean_terminated_length": 150.74769592285156, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.08638128974435577, "grad_norm": 0.019777290523052216, "kl": 0.015145301818847656, "learning_rate": 9.064434061081562e-06, "loss": -0.0371, "num_tokens": 90212102.0, "reward": 0.40325623750686646, "reward_std": 0.04822305217385292, "rewards/gemini_judge_reward_func/mean": 0.1049107164144516, "rewards/gemini_judge_reward_func/std": 0.24060459434986115, "rewards/semantic_correctness_reward_func/mean": 0.4479595124721527, "rewards/semantic_correctness_reward_func/std": 0.20428718626499176, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 165.60714721679688, "completions/mean_terminated_length": 157.87387084960938, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.08672271776706073, "grad_norm": 0.021202484145760536, "kl": 0.012971878051757812, "learning_rate": 9.0547816876996e-06, "loss": -0.0236, "num_tokens": 90555198.0, "reward": 0.4358048439025879, "reward_std": 0.06684070080518723, "rewards/gemini_judge_reward_func/mean": 0.1149553582072258, "rewards/gemini_judge_reward_func/std": 0.22562715411186218, "rewards/semantic_correctness_reward_func/mean": 0.4118633270263672, "rewards/semantic_correctness_reward_func/std": 0.22267456352710724, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 171.31251525878906, "completions/mean_terminated_length": 155.80908203125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.08706414578976569, "grad_norm": 0.02179318107664585, "kl": 0.012566328048706055, "learning_rate": 9.045084971874738e-06, "loss": -0.0163, "num_tokens": 90887740.0, "reward": 0.4795321226119995, "reward_std": 0.07366758584976196, "rewards/gemini_judge_reward_func/mean": 0.1484375, "rewards/gemini_judge_reward_func/std": 0.25720080733299255, "rewards/semantic_correctness_reward_func/mean": 0.4562854766845703, "rewards/semantic_correctness_reward_func/std": 0.21975703537464142, "rewards/xmlcount_reward_func/mean": 0.8222500085830688, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 168.32589721679688, "completions/mean_terminated_length": 160.61712646484375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.08740557381247066, "grad_norm": 0.020558955147862434, "kl": 0.012400150299072266, "learning_rate": 9.035344019648701e-06, "loss": -0.0201, "num_tokens": 91228273.0, "reward": 0.4610043466091156, "reward_std": 0.06945549696683884, "rewards/gemini_judge_reward_func/mean": 0.1584821492433548, "rewards/gemini_judge_reward_func/std": 0.2750774919986725, "rewards/semantic_correctness_reward_func/mean": 0.4508070945739746, "rewards/semantic_correctness_reward_func/std": 0.2163994014263153, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 172.97769165039062, "completions/mean_terminated_length": 153.54794311523438, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.08774700183517563, "grad_norm": 0.021823478862643242, "kl": 0.012814998626708984, "learning_rate": 9.025558937546987e-06, "loss": -0.0005, "num_tokens": 91594656.0, "reward": 0.47361400723457336, "reward_std": 0.0772940143942833, "rewards/gemini_judge_reward_func/mean": 0.1395089328289032, "rewards/gemini_judge_reward_func/std": 0.24901461601257324, "rewards/semantic_correctness_reward_func/mean": 0.444551944732666, "rewards/semantic_correctness_reward_func/std": 0.21275341510772705, "rewards/xmlcount_reward_func/mean": 0.8222500681877136, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 172.4107208251953, "completions/mean_terminated_length": 148.97247314453125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.08808842985788058, "grad_norm": 0.020453812554478645, "kl": 0.013701677322387695, "learning_rate": 9.015729832577681e-06, "loss": 0.0261, "num_tokens": 91975412.0, "reward": 0.4378080368041992, "reward_std": 0.08023884147405624, "rewards/gemini_judge_reward_func/mean": 0.1551339328289032, "rewards/gemini_judge_reward_func/std": 0.28348881006240845, "rewards/semantic_correctness_reward_func/mean": 0.4487721025943756, "rewards/semantic_correctness_reward_func/std": 0.23882971704006195, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 167.9866180419922, "completions/mean_terminated_length": 156.36651611328125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.08842985788058555, "grad_norm": 0.020137522369623184, "kl": 0.013067245483398438, "learning_rate": 9.005856812230304e-06, "loss": -0.0241, "num_tokens": 92304397.0, "reward": 0.4488251805305481, "reward_std": 0.04712558910250664, "rewards/gemini_judge_reward_func/mean": 0.1037946417927742, "rewards/gemini_judge_reward_func/std": 0.21201863884925842, "rewards/semantic_correctness_reward_func/mean": 0.42778635025024414, "rewards/semantic_correctness_reward_func/std": 0.20638985931873322, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.39858436584472656, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 180.9419708251953, "completions/mean_terminated_length": 161.69406127929688, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.08877128590329052, "grad_norm": 0.020242715254426003, "kl": 0.012917518615722656, "learning_rate": 8.995939984474624e-06, "loss": 0.0083, "num_tokens": 92643564.0, "reward": 0.4373071789741516, "reward_std": 0.06374209374189377, "rewards/gemini_judge_reward_func/mean": 0.1104910746216774, "rewards/gemini_judge_reward_func/std": 0.22035281360149384, "rewards/semantic_correctness_reward_func/mean": 0.42386600375175476, "rewards/semantic_correctness_reward_func/std": 0.19683243334293365, "rewards/xmlcount_reward_func/mean": 0.7708438038825989, "rewards/xmlcount_reward_func/std": 0.4208168685436249, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 163.7544708251953, "completions/mean_terminated_length": 148.11363220214844, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.08911271392599547, "grad_norm": 0.02077900432050228, "kl": 0.011655330657958984, "learning_rate": 8.98597945775948e-06, "loss": 0.0103, "num_tokens": 92977885.0, "reward": 0.4542004466056824, "reward_std": 0.0714111477136612, "rewards/gemini_judge_reward_func/mean": 0.1316964328289032, "rewards/gemini_judge_reward_func/std": 0.2330818623304367, "rewards/semantic_correctness_reward_func/mean": 0.43685027956962585, "rewards/semantic_correctness_reward_func/std": 0.2025267481803894, "rewards/xmlcount_reward_func/mean": 0.7853795289993286, "rewards/xmlcount_reward_func/std": 0.41141119599342346, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 151.34375, "completions/mean_terminated_length": 147.4304962158203, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.08945414194870044, "grad_norm": 0.022191418334841728, "kl": 0.013633251190185547, "learning_rate": 8.975975341011595e-06, "loss": -0.0129, "num_tokens": 93303462.0, "reward": 0.4570338726043701, "reward_std": 0.07553589344024658, "rewards/gemini_judge_reward_func/mean": 0.1194196417927742, "rewards/gemini_judge_reward_func/std": 0.22453764081001282, "rewards/semantic_correctness_reward_func/mean": 0.4375797212123871, "rewards/semantic_correctness_reward_func/std": 0.1922610104084015, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 161.97769165039062, "completions/mean_terminated_length": 154.2117156982422, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0897955699714054, "grad_norm": 0.01960793137550354, "kl": 0.01739645004272461, "learning_rate": 8.96592774363439e-06, "loss": -0.0111, "num_tokens": 93676765.0, "reward": 0.3955410122871399, "reward_std": 0.06413434445858002, "rewards/gemini_judge_reward_func/mean": 0.1004464253783226, "rewards/gemini_judge_reward_func/std": 0.21559134125709534, "rewards/semantic_correctness_reward_func/mean": 0.41831207275390625, "rewards/semantic_correctness_reward_func/std": 0.17913205921649933, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 159.05357360839844, "completions/mean_terminated_length": 155.17489624023438, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.09013699799411036, "grad_norm": 0.020866382867097855, "kl": 0.012538671493530273, "learning_rate": 8.955836775506776e-06, "loss": -0.0182, "num_tokens": 94019389.0, "reward": 0.4702732563018799, "reward_std": 0.07895836979150772, "rewards/gemini_judge_reward_func/mean": 0.1473214328289032, "rewards/gemini_judge_reward_func/std": 0.2731512784957886, "rewards/semantic_correctness_reward_func/mean": 0.4479733109474182, "rewards/semantic_correctness_reward_func/std": 0.2174108326435089, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 185.57589721679688, "completions/mean_terminated_length": 162.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.09047842601681533, "grad_norm": 0.019125469028949738, "kl": 0.013179779052734375, "learning_rate": 8.94570254698197e-06, "loss": -0.0534, "num_tokens": 94374994.0, "reward": 0.41334468126296997, "reward_std": 0.0682515949010849, "rewards/gemini_judge_reward_func/mean": 0.0904017835855484, "rewards/gemini_judge_reward_func/std": 0.23300401866436005, "rewards/semantic_correctness_reward_func/mean": 0.38441964983940125, "rewards/semantic_correctness_reward_func/std": 0.19101744890213013, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 167.33482360839844, "completions/mean_terminated_length": 155.7058868408203, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.0908198540395203, "grad_norm": 0.020079247653484344, "kl": 0.015445232391357422, "learning_rate": 8.935525168886263e-06, "loss": -0.0174, "num_tokens": 94729441.0, "reward": 0.4319685995578766, "reward_std": 0.05235441401600838, "rewards/gemini_judge_reward_func/mean": 0.109375, "rewards/gemini_judge_reward_func/std": 0.2290094941854477, "rewards/semantic_correctness_reward_func/mean": 0.40384286642074585, "rewards/semantic_correctness_reward_func/std": 0.20472340285778046, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 161.7366180419922, "completions/mean_terminated_length": 153.96847534179688, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.09116128206222526, "grad_norm": 0.02106994017958641, "kl": 0.015232563018798828, "learning_rate": 8.92530475251784e-06, "loss": -0.0094, "num_tokens": 95086954.0, "reward": 0.4232273995876312, "reward_std": 0.07202961295843124, "rewards/gemini_judge_reward_func/mean": 0.1540178507566452, "rewards/gemini_judge_reward_func/std": 0.2704537510871887, "rewards/semantic_correctness_reward_func/mean": 0.44960108399391174, "rewards/semantic_correctness_reward_func/std": 0.2044912576675415, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 164.90625, "completions/mean_terminated_length": 149.28636169433594, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.09150271008493022, "grad_norm": 0.02133958414196968, "kl": 0.014173030853271484, "learning_rate": 8.91504140964553e-06, "loss": -0.0226, "num_tokens": 95457245.0, "reward": 0.446917325258255, "reward_std": 0.06879051774740219, "rewards/gemini_judge_reward_func/mean": 0.1160714253783226, "rewards/gemini_judge_reward_func/std": 0.23180073499679565, "rewards/semantic_correctness_reward_func/mean": 0.4235150218009949, "rewards/semantic_correctness_reward_func/std": 0.21951408684253693, "rewards/xmlcount_reward_func/mean": 0.7894643545150757, "rewards/xmlcount_reward_func/std": 0.4070565700531006, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 163.02679443359375, "completions/mean_terminated_length": 151.33937072753906, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.09184413810763518, "grad_norm": 0.02040465921163559, "kl": 0.014896392822265625, "learning_rate": 8.90473525250761e-06, "loss": -0.005, "num_tokens": 95823515.0, "reward": 0.44515278935432434, "reward_std": 0.0725380927324295, "rewards/gemini_judge_reward_func/mean": 0.1417410671710968, "rewards/gemini_judge_reward_func/std": 0.26416993141174316, "rewards/semantic_correctness_reward_func/mean": 0.4765315651893616, "rewards/semantic_correctness_reward_func/std": 0.21309268474578857, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427841901779175, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 174.46429443359375, "completions/mean_terminated_length": 151.0825653076172, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.09218556613034015, "grad_norm": 0.020097464323043823, "kl": 0.018402576446533203, "learning_rate": 8.894386393810563e-06, "loss": -0.011, "num_tokens": 96182759.0, "reward": 0.3942418396472931, "reward_std": 0.0597333200275898, "rewards/gemini_judge_reward_func/mean": 0.1395089328289032, "rewards/gemini_judge_reward_func/std": 0.2797568440437317, "rewards/semantic_correctness_reward_func/mean": 0.4409410059452057, "rewards/semantic_correctness_reward_func/std": 0.21194607019424438, "rewards/xmlcount_reward_func/mean": 0.6256250739097595, "rewards/xmlcount_reward_func/std": 0.48569241166114807, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 161.52232360839844, "completions/mean_terminated_length": 153.75225830078125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.0925269941530451, "grad_norm": 0.021590745076537132, "kl": 0.016841650009155273, "learning_rate": 8.883994946727848e-06, "loss": -0.0153, "num_tokens": 96539908.0, "reward": 0.4480898380279541, "reward_std": 0.060927197337150574, "rewards/gemini_judge_reward_func/mean": 0.1216517835855484, "rewards/gemini_judge_reward_func/std": 0.2379140406847, "rewards/semantic_correctness_reward_func/mean": 0.4241454303264618, "rewards/semantic_correctness_reward_func/std": 0.20089684426784515, "rewards/xmlcount_reward_func/mean": 0.786500096321106, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 153.83929443359375, "completions/mean_terminated_length": 146.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.09286842217575007, "grad_norm": 0.021431416273117065, "kl": 0.015526533126831055, "learning_rate": 8.873561024898668e-06, "loss": -0.0014, "num_tokens": 96885900.0, "reward": 0.4165668785572052, "reward_std": 0.05786097049713135, "rewards/gemini_judge_reward_func/mean": 0.1071428582072258, "rewards/gemini_judge_reward_func/std": 0.2136441320180893, "rewards/semantic_correctness_reward_func/mean": 0.40279844403266907, "rewards/semantic_correctness_reward_func/std": 0.18588510155677795, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 190.33482360839844, "completions/mean_terminated_length": 159.4583282470703, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.09320985019845504, "grad_norm": 0.01978684961795807, "kl": 0.016774654388427734, "learning_rate": 8.863084742426719e-06, "loss": -0.0157, "num_tokens": 97254931.0, "reward": 0.4032873511314392, "reward_std": 0.05024518445134163, "rewards/gemini_judge_reward_func/mean": 0.1261160671710968, "rewards/gemini_judge_reward_func/std": 0.2593517303466797, "rewards/semantic_correctness_reward_func/mean": 0.4146420657634735, "rewards/semantic_correctness_reward_func/std": 0.1971837282180786, "rewards/xmlcount_reward_func/mean": 0.6747812628746033, "rewards/xmlcount_reward_func/std": 0.4702270030975342, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 184.3928680419922, "completions/mean_terminated_length": 161.28439331054688, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.09355127822116001, "grad_norm": 0.02010081149637699, "kl": 0.017665863037109375, "learning_rate": 8.852566213878947e-06, "loss": -0.0007, "num_tokens": 97643595.0, "reward": 0.39674344658851624, "reward_std": 0.074442058801651, "rewards/gemini_judge_reward_func/mean": 0.1517857164144516, "rewards/gemini_judge_reward_func/std": 0.27581340074539185, "rewards/semantic_correctness_reward_func/mean": 0.4288957417011261, "rewards/semantic_correctness_reward_func/std": 0.21292337775230408, "rewards/xmlcount_reward_func/mean": 0.6256250739097595, "rewards/xmlcount_reward_func/std": 0.48569241166114807, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 173.38394165039062, "completions/mean_terminated_length": 161.8371124267578, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.09389270624386496, "grad_norm": 0.020230090245604515, "kl": 0.012257099151611328, "learning_rate": 8.842005554284296e-06, "loss": 0.0054, "num_tokens": 97985765.0, "reward": 0.44697558879852295, "reward_std": 0.05089180916547775, "rewards/gemini_judge_reward_func/mean": 0.1037946417927742, "rewards/gemini_judge_reward_func/std": 0.19835957884788513, "rewards/semantic_correctness_reward_func/mean": 0.4185386300086975, "rewards/semantic_correctness_reward_func/std": 0.20165562629699707, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 147.79464721679688, "completions/mean_terminated_length": 147.79464721679688, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.09423413426656993, "grad_norm": 0.02072894014418125, "kl": 0.022124528884887695, "learning_rate": 8.831402879132447e-06, "loss": -0.0036, "num_tokens": 98334511.0, "reward": 0.4387449026107788, "reward_std": 0.06641606986522675, "rewards/gemini_judge_reward_func/mean": 0.1350446492433548, "rewards/gemini_judge_reward_func/std": 0.25258663296699524, "rewards/semantic_correctness_reward_func/mean": 0.42213496565818787, "rewards/semantic_correctness_reward_func/std": 0.22363466024398804, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 173.51339721679688, "completions/mean_terminated_length": 158.04998779296875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.0945755622892749, "grad_norm": 0.019913461059331894, "kl": 0.012322187423706055, "learning_rate": 8.820758304372557e-06, "loss": -0.0162, "num_tokens": 98659610.0, "reward": 0.4966113567352295, "reward_std": 0.062031567096710205, "rewards/gemini_judge_reward_func/mean": 0.1506696492433548, "rewards/gemini_judge_reward_func/std": 0.25807496905326843, "rewards/semantic_correctness_reward_func/mean": 0.46571722626686096, "rewards/semantic_correctness_reward_func/std": 0.22223028540611267, "rewards/xmlcount_reward_func/mean": 0.8580000996589661, "rewards/xmlcount_reward_func/std": 0.35106155276298523, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 184.04464721679688, "completions/mean_terminated_length": 152.9351806640625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.09491699031197985, "grad_norm": 0.01981070265173912, "kl": 0.014884471893310547, "learning_rate": 8.810071946411989e-06, "loss": -0.0066, "num_tokens": 99051748.0, "reward": 0.3772115409374237, "reward_std": 0.07925941050052643, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.2614969313144684, "rewards/semantic_correctness_reward_func/mean": 0.43397727608680725, "rewards/semantic_correctness_reward_func/std": 0.20137180387973785, "rewards/xmlcount_reward_func/mean": 0.5988079905509949, "rewards/xmlcount_reward_func/std": 0.4918448030948639, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 178.5357208251953, "completions/mean_terminated_length": 163.16363525390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.09525841833468482, "grad_norm": 0.020953577011823654, "kl": 0.015062332153320312, "learning_rate": 8.799343922115045e-06, "loss": -0.0112, "num_tokens": 99401472.0, "reward": 0.44807708263397217, "reward_std": 0.06729375571012497, "rewards/gemini_judge_reward_func/mean": 0.1417410671710968, "rewards/gemini_judge_reward_func/std": 0.23616158962249756, "rewards/semantic_correctness_reward_func/mean": 0.4419924318790436, "rewards/semantic_correctness_reward_func/std": 0.1812608540058136, "rewards/xmlcount_reward_func/mean": 0.7574554085731506, "rewards/xmlcount_reward_func/std": 0.4265342950820923, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 179.24107360839844, "completions/mean_terminated_length": 155.99081420898438, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.09559984635738979, "grad_norm": 0.01885247975587845, "kl": 0.014855623245239258, "learning_rate": 8.788574348801676e-06, "loss": -0.0195, "num_tokens": 99751306.0, "reward": 0.4536531865596771, "reward_std": 0.06807014346122742, "rewards/gemini_judge_reward_func/mean": 0.140625, "rewards/gemini_judge_reward_func/std": 0.24781839549541473, "rewards/semantic_correctness_reward_func/mean": 0.43636396527290344, "rewards/semantic_correctness_reward_func/std": 0.21698756515979767, "rewards/xmlcount_reward_func/mean": 0.7753258943557739, "rewards/xmlcount_reward_func/std": 0.4178903102874756, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 180.2366180419922, "completions/mean_terminated_length": 157.0137481689453, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.09594127438009474, "grad_norm": 0.01885247975587845, "kl": 0.014781713485717773, "learning_rate": 8.788574348801676e-06, "loss": -0.0174, "num_tokens": 100103247.0, "reward": 0.43649476766586304, "reward_std": 0.06695149838924408, "rewards/gemini_judge_reward_func/mean": 0.1395089328289032, "rewards/gemini_judge_reward_func/std": 0.28373584151268005, "rewards/semantic_correctness_reward_func/mean": 0.43770572543144226, "rewards/semantic_correctness_reward_func/std": 0.22858086228370667, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 165.84375, "completions/mean_terminated_length": 158.11260986328125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.09628270240279971, "grad_norm": 0.019319765269756317, "kl": 0.013626575469970703, "learning_rate": 8.777763344246209e-06, "loss": -0.0305, "num_tokens": 100475300.0, "reward": 0.3688974678516388, "reward_std": 0.06074400618672371, "rewards/gemini_judge_reward_func/mean": 0.1082589253783226, "rewards/gemini_judge_reward_func/std": 0.20158810913562775, "rewards/semantic_correctness_reward_func/mean": 0.4124692380428314, "rewards/semantic_correctness_reward_func/std": 0.19219088554382324, "rewards/xmlcount_reward_func/mean": 0.6077499985694885, "rewards/xmlcount_reward_func/std": 0.48996883630752563, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 172.38394165039062, "completions/mean_terminated_length": 156.89999389648438, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.09662413042550468, "grad_norm": 0.020828846842050552, "kl": 0.018453598022460938, "learning_rate": 8.766911026676063e-06, "loss": -0.0088, "num_tokens": 100845754.0, "reward": 0.44821447134017944, "reward_std": 0.07275271415710449, "rewards/gemini_judge_reward_func/mean": 0.1763392835855484, "rewards/gemini_judge_reward_func/std": 0.2763844132423401, "rewards/semantic_correctness_reward_func/mean": 0.45839372277259827, "rewards/semantic_correctness_reward_func/std": 0.2118861824274063, "rewards/xmlcount_reward_func/mean": 0.7150000929832458, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 176.25894165039062, "completions/mean_terminated_length": 156.9040985107422, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.09696555844820964, "grad_norm": 0.021423369646072388, "kl": 0.01433420181274414, "learning_rate": 8.756017514770444e-06, "loss": -0.0316, "num_tokens": 101213308.0, "reward": 0.3927696943283081, "reward_std": 0.05550656095147133, "rewards/gemini_judge_reward_func/mean": 0.0803571417927742, "rewards/gemini_judge_reward_func/std": 0.1809735894203186, "rewards/semantic_correctness_reward_func/mean": 0.3821161687374115, "rewards/semantic_correctness_reward_func/std": 0.18534725904464722, "rewards/xmlcount_reward_func/mean": 0.7105089426040649, "rewards/xmlcount_reward_func/std": 0.45530179142951965, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 167.0357208251953, "completions/mean_terminated_length": 159.31532287597656, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.0973069864709146, "grad_norm": 0.01976688578724861, "kl": 0.013885021209716797, "learning_rate": 8.745082927659048e-06, "loss": -0.0169, "num_tokens": 101551900.0, "reward": 0.459250271320343, "reward_std": 0.07970133423805237, "rewards/gemini_judge_reward_func/mean": 0.1361607164144516, "rewards/gemini_judge_reward_func/std": 0.26340386271476746, "rewards/semantic_correctness_reward_func/mean": 0.4509297311306, "rewards/semantic_correctness_reward_func/std": 0.2095455825328827, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 168.63839721679688, "completions/mean_terminated_length": 145.09632873535156, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.09764841449361956, "grad_norm": 0.024606870487332344, "kl": 0.015841007232666016, "learning_rate": 8.734107384920771e-06, "loss": -0.018, "num_tokens": 101920447.0, "reward": 0.4331532418727875, "reward_std": 0.0668218731880188, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.24259328842163086, "rewards/semantic_correctness_reward_func/mean": 0.4455518424510956, "rewards/semantic_correctness_reward_func/std": 0.21146556735038757, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 149.97769165039062, "completions/mean_terminated_length": 149.97769165039062, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.09798984251632453, "grad_norm": 0.02012326754629612, "kl": 0.014970779418945312, "learning_rate": 8.72309100658239e-06, "loss": -0.0045, "num_tokens": 102259354.0, "reward": 0.4638066589832306, "reward_std": 0.06380105763673782, "rewards/gemini_judge_reward_func/mean": 0.1350446492433548, "rewards/gemini_judge_reward_func/std": 0.2305423468351364, "rewards/semantic_correctness_reward_func/mean": 0.4402652382850647, "rewards/semantic_correctness_reward_func/std": 0.22478193044662476, "rewards/xmlcount_reward_func/mean": 0.8043392896652222, "rewards/xmlcount_reward_func/std": 0.3985668420791626, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 181.07144165039062, "completions/mean_terminated_length": 161.82647705078125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.09833127053902949, "grad_norm": 0.02045246586203575, "kl": 0.014268636703491211, "learning_rate": 8.71203391311725e-06, "loss": -0.0148, "num_tokens": 102623894.0, "reward": 0.42740973830223083, "reward_std": 0.06286334991455078, "rewards/gemini_judge_reward_func/mean": 0.1049107164144516, "rewards/gemini_judge_reward_func/std": 0.2199123203754425, "rewards/semantic_correctness_reward_func/mean": 0.4346645176410675, "rewards/semantic_correctness_reward_func/std": 0.19024407863616943, "rewards/xmlcount_reward_func/mean": 0.7462812662124634, "rewards/xmlcount_reward_func/std": 0.4369716942310333, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 165.91519165039062, "completions/mean_terminated_length": 158.1846923828125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.09867269856173445, "grad_norm": 0.02105833776295185, "kl": 0.012676715850830078, "learning_rate": 8.700936225443958e-06, "loss": -0.0097, "num_tokens": 102977455.0, "reward": 0.4517498314380646, "reward_std": 0.05953366681933403, "rewards/gemini_judge_reward_func/mean": 0.1082589253783226, "rewards/gemini_judge_reward_func/std": 0.22770272195339203, "rewards/semantic_correctness_reward_func/mean": 0.4334811270236969, "rewards/semantic_correctness_reward_func/std": 0.2211431860923767, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 169.62054443359375, "completions/mean_terminated_length": 154.08636474609375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.09901412658443942, "grad_norm": 0.021059256047010422, "kl": 0.017063140869140625, "learning_rate": 8.689798064925049e-06, "loss": -0.0157, "num_tokens": 103329290.0, "reward": 0.42608827352523804, "reward_std": 0.0673830509185791, "rewards/gemini_judge_reward_func/mean": 0.0892857164144516, "rewards/gemini_judge_reward_func/std": 0.21532420814037323, "rewards/semantic_correctness_reward_func/mean": 0.44812875986099243, "rewards/semantic_correctness_reward_func/std": 0.20307044684886932, "rewards/xmlcount_reward_func/mean": 0.7518705725669861, "rewards/xmlcount_reward_func/std": 0.4314948618412018, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 157.41964721679688, "completions/mean_terminated_length": 149.61260986328125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.09935555460714438, "grad_norm": 0.02095315419137478, "kl": 0.016399383544921875, "learning_rate": 8.67861955336566e-06, "loss": -0.0011, "num_tokens": 103694592.0, "reward": 0.44059500098228455, "reward_std": 0.07820717245340347, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.25276488065719604, "rewards/semantic_correctness_reward_func/mean": 0.40902838110923767, "rewards/semantic_correctness_reward_func/std": 0.2408311814069748, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 176.21429443359375, "completions/mean_terminated_length": 160.79998779296875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.09969698262984934, "grad_norm": 0.019932152703404427, "kl": 0.012141227722167969, "learning_rate": 8.6674008130122e-06, "loss": -0.0288, "num_tokens": 104034788.0, "reward": 0.46020758152008057, "reward_std": 0.06737792491912842, "rewards/gemini_judge_reward_func/mean": 0.1629464328289032, "rewards/gemini_judge_reward_func/std": 0.27347174286842346, "rewards/semantic_correctness_reward_func/mean": 0.47364482283592224, "rewards/semantic_correctness_reward_func/std": 0.21396541595458984, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 156.05357360839844, "completions/mean_terminated_length": 148.23423767089844, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.10003841065255431, "grad_norm": 0.021158162504434586, "kl": 0.016646862030029297, "learning_rate": 8.65614196655102e-06, "loss": 0.0125, "num_tokens": 104408036.0, "reward": 0.3966463506221771, "reward_std": 0.0536530539393425, "rewards/gemini_judge_reward_func/mean": 0.0993303582072258, "rewards/gemini_judge_reward_func/std": 0.20203447341918945, "rewards/semantic_correctness_reward_func/mean": 0.43345481157302856, "rewards/semantic_correctness_reward_func/std": 0.19227315485477448, "rewards/xmlcount_reward_func/mean": 0.6755580902099609, "rewards/xmlcount_reward_func/std": 0.4676108658313751, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 157.34375, "completions/mean_terminated_length": 157.34375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.10037983867525928, "grad_norm": 0.021672172471880913, "kl": 0.013187885284423828, "learning_rate": 8.644843137107058e-06, "loss": -0.0071, "num_tokens": 104742949.0, "reward": 0.48235565423965454, "reward_std": 0.07993865013122559, "rewards/gemini_judge_reward_func/mean": 0.1506696492433548, "rewards/gemini_judge_reward_func/std": 0.25258663296699524, "rewards/semantic_correctness_reward_func/mean": 0.4301888048648834, "rewards/semantic_correctness_reward_func/std": 0.23608912527561188, "rewards/xmlcount_reward_func/mean": 0.8401250839233398, "rewards/xmlcount_reward_func/std": 0.3684578537940979, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 160.49554443359375, "completions/mean_terminated_length": 148.7737579345703, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.10072126669796423, "grad_norm": 0.019521746784448624, "kl": 0.015337467193603516, "learning_rate": 8.633504448242504e-06, "loss": -0.0161, "num_tokens": 105137996.0, "reward": 0.4107567369937897, "reward_std": 0.07504715025424957, "rewards/gemini_judge_reward_func/mean": 0.1495535671710968, "rewards/gemini_judge_reward_func/std": 0.2677757143974304, "rewards/semantic_correctness_reward_func/mean": 0.4689352810382843, "rewards/semantic_correctness_reward_func/std": 0.22458945214748383, "rewards/xmlcount_reward_func/mean": 0.6428705453872681, "rewards/xmlcount_reward_func/std": 0.47865068912506104, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 157.625, "completions/mean_terminated_length": 153.7399139404297, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1010626947206692, "grad_norm": 0.0203217975795269, "kl": 0.014668941497802734, "learning_rate": 8.622126023955446e-06, "loss": 0.0034, "num_tokens": 105495484.0, "reward": 0.45263081789016724, "reward_std": 0.07190153002738953, "rewards/gemini_judge_reward_func/mean": 0.1183035746216774, "rewards/gemini_judge_reward_func/std": 0.2306644320487976, "rewards/semantic_correctness_reward_func/mean": 0.4177968204021454, "rewards/semantic_correctness_reward_func/std": 0.22075255215168, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 174.09376525878906, "completions/mean_terminated_length": 150.7018280029297, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.10140412274337417, "grad_norm": 0.021145980805158615, "kl": 0.015403032302856445, "learning_rate": 8.610707988678504e-06, "loss": -0.0054, "num_tokens": 105849873.0, "reward": 0.42827874422073364, "reward_std": 0.07740958034992218, "rewards/gemini_judge_reward_func/mean": 0.1328125, "rewards/gemini_judge_reward_func/std": 0.2436303198337555, "rewards/semantic_correctness_reward_func/mean": 0.4412953555583954, "rewards/semantic_correctness_reward_func/std": 0.19017373025417328, "rewards/xmlcount_reward_func/mean": 0.7172366380691528, "rewards/xmlcount_reward_func/std": 0.4509044885635376, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 157.50894165039062, "completions/mean_terminated_length": 149.7027130126953, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.10174555076607912, "grad_norm": 0.020420530810952187, "kl": 0.01408076286315918, "learning_rate": 8.599250467277483e-06, "loss": -0.0364, "num_tokens": 106188755.0, "reward": 0.4316641092300415, "reward_std": 0.05426663160324097, "rewards/gemini_judge_reward_func/mean": 0.1227678582072258, "rewards/gemini_judge_reward_func/std": 0.2270781695842743, "rewards/semantic_correctness_reward_func/mean": 0.4112846255302429, "rewards/semantic_correctness_reward_func/std": 0.19838160276412964, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 172.80357360839844, "completions/mean_terminated_length": 153.36985778808594, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.10208697878878409, "grad_norm": 0.020977923646569252, "kl": 0.014760017395019531, "learning_rate": 8.587753585050004e-06, "loss": -0.0314, "num_tokens": 106526319.0, "reward": 0.4237655997276306, "reward_std": 0.07243627309799194, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.264680951833725, "rewards/semantic_correctness_reward_func/mean": 0.3919081389904022, "rewards/semantic_correctness_reward_func/std": 0.21118712425231934, "rewards/xmlcount_reward_func/mean": 0.735111653804779, "rewards/xmlcount_reward_func/std": 0.4418267011642456, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 157.35269165039062, "completions/mean_terminated_length": 153.46636962890625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.10242840681148906, "grad_norm": 0.01886204071342945, "kl": 0.012836217880249023, "learning_rate": 8.576217467724129e-06, "loss": -0.0136, "num_tokens": 106879998.0, "reward": 0.4238353371620178, "reward_std": 0.06256872415542603, "rewards/gemini_judge_reward_func/mean": 0.1294642835855484, "rewards/gemini_judge_reward_func/std": 0.2527475655078888, "rewards/semantic_correctness_reward_func/mean": 0.43024787306785583, "rewards/semantic_correctness_reward_func/std": 0.22388145327568054, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 166.91964721679688, "completions/mean_terminated_length": 155.2850799560547, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.10276983483419401, "grad_norm": 0.021645022556185722, "kl": 0.013919830322265625, "learning_rate": 8.564642241456986e-06, "loss": -0.0271, "num_tokens": 107232332.0, "reward": 0.47711122035980225, "reward_std": 0.07099025696516037, "rewards/gemini_judge_reward_func/mean": 0.1584821492433548, "rewards/gemini_judge_reward_func/std": 0.30040243268013, "rewards/semantic_correctness_reward_func/mean": 0.4598415791988373, "rewards/semantic_correctness_reward_func/std": 0.22269363701343536, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 178.12501525878906, "completions/mean_terminated_length": 146.79629516601562, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.10311126285689898, "grad_norm": 0.02136322483420372, "kl": 0.013774394989013672, "learning_rate": 8.553028032833397e-06, "loss": 0.0098, "num_tokens": 107599752.0, "reward": 0.4435153007507324, "reward_std": 0.07765571027994156, "rewards/gemini_judge_reward_func/mean": 0.1741071492433548, "rewards/gemini_judge_reward_func/std": 0.2954044044017792, "rewards/semantic_correctness_reward_func/mean": 0.45273685455322266, "rewards/semantic_correctness_reward_func/std": 0.21585151553153992, "rewards/xmlcount_reward_func/mean": 0.7083125114440918, "rewards/xmlcount_reward_func/std": 0.45263010263442993, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 154.9375, "completions/mean_terminated_length": 147.1081085205078, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.10345269087960395, "grad_norm": 0.020963052287697792, "kl": 0.012014389038085938, "learning_rate": 8.541374968864486e-06, "loss": -0.0102, "num_tokens": 107968010.0, "reward": 0.42616450786590576, "reward_std": 0.05278439447283745, "rewards/gemini_judge_reward_func/mean": 0.109375, "rewards/gemini_judge_reward_func/std": 0.22280631959438324, "rewards/semantic_correctness_reward_func/mean": 0.4488045871257782, "rewards/semantic_correctness_reward_func/std": 0.1997697800397873, "rewards/xmlcount_reward_func/mean": 0.7316340208053589, "rewards/xmlcount_reward_func/std": 0.442789226770401, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 154.2991180419922, "completions/mean_terminated_length": 142.49322509765625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.10379411890230891, "grad_norm": 0.02215876430273056, "kl": 0.01742839813232422, "learning_rate": 8.529683176986295e-06, "loss": -0.0091, "num_tokens": 108318421.0, "reward": 0.4235052466392517, "reward_std": 0.06170068308711052, "rewards/gemini_judge_reward_func/mean": 0.1261160671710968, "rewards/gemini_judge_reward_func/std": 0.22584888339042664, "rewards/semantic_correctness_reward_func/mean": 0.4352940022945404, "rewards/semantic_correctness_reward_func/std": 0.19691424071788788, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 164.59375, "completions/mean_terminated_length": 148.96817016601562, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.10413554692501387, "grad_norm": 0.0207134447991848, "kl": 0.012769222259521484, "learning_rate": 8.517952785058385e-06, "loss": -0.0041, "num_tokens": 108649042.0, "reward": 0.45065411925315857, "reward_std": 0.0538845956325531, "rewards/gemini_judge_reward_func/mean": 0.1026785746216774, "rewards/gemini_judge_reward_func/std": 0.19394874572753906, "rewards/semantic_correctness_reward_func/mean": 0.4034132957458496, "rewards/semantic_correctness_reward_func/std": 0.21382492780685425, "rewards/xmlcount_reward_func/mean": 0.8222500085830688, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 166.10714721679688, "completions/mean_terminated_length": 146.52053833007812, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.10447697494771883, "grad_norm": 0.020172296091914177, "kl": 0.014577388763427734, "learning_rate": 8.506183921362443e-06, "loss": -0.0415, "num_tokens": 109025054.0, "reward": 0.4207233786582947, "reward_std": 0.07463856041431427, "rewards/gemini_judge_reward_func/mean": 0.1629464328289032, "rewards/gemini_judge_reward_func/std": 0.2765292227268219, "rewards/semantic_correctness_reward_func/mean": 0.44155433773994446, "rewards/semantic_correctness_reward_func/std": 0.21850642561912537, "rewards/xmlcount_reward_func/mean": 0.6680848002433777, "rewards/xmlcount_reward_func/std": 0.4690874218940735, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 154.96875, "completions/mean_terminated_length": 143.1719512939453, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.1048184029704238, "grad_norm": 0.022241463884711266, "kl": 0.01328420639038086, "learning_rate": 8.494376714600878e-06, "loss": 0.0014, "num_tokens": 109369683.0, "reward": 0.4572017788887024, "reward_std": 0.06673526763916016, "rewards/gemini_judge_reward_func/mean": 0.1350446492433548, "rewards/gemini_judge_reward_func/std": 0.22809800505638123, "rewards/semantic_correctness_reward_func/mean": 0.42502665519714355, "rewards/semantic_correctness_reward_func/std": 0.22096006572246552, "rewards/xmlcount_reward_func/mean": 0.795446515083313, "rewards/xmlcount_reward_func/std": 0.3996833562850952, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 176.94644165039062, "completions/mean_terminated_length": 149.6221160888672, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.10515983099312876, "grad_norm": 0.02019048109650612, "kl": 0.012269735336303711, "learning_rate": 8.482531293895412e-06, "loss": -0.0139, "num_tokens": 109722579.0, "reward": 0.4193563163280487, "reward_std": 0.05619501322507858, "rewards/gemini_judge_reward_func/mean": 0.1439732164144516, "rewards/gemini_judge_reward_func/std": 0.24185720086097717, "rewards/semantic_correctness_reward_func/mean": 0.4145851731300354, "rewards/semantic_correctness_reward_func/std": 0.21506093442440033, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 168.7366180419922, "completions/mean_terminated_length": 157.126708984375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.10550125901583372, "grad_norm": 0.022851206362247467, "kl": 0.013672828674316406, "learning_rate": 8.470647788785665e-06, "loss": -0.0349, "num_tokens": 110095080.0, "reward": 0.4072951674461365, "reward_std": 0.06596186012029648, "rewards/gemini_judge_reward_func/mean": 0.125, "rewards/gemini_judge_reward_func/std": 0.24144557118415833, "rewards/semantic_correctness_reward_func/mean": 0.4279758036136627, "rewards/semantic_correctness_reward_func/std": 0.21440142393112183, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 147.3125, "completions/mean_terminated_length": 143.3811798095703, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.10584268703853869, "grad_norm": 0.023006869480013847, "kl": 0.017367839813232422, "learning_rate": 8.458726329227748e-06, "loss": -0.0159, "num_tokens": 110468182.0, "reward": 0.413924902677536, "reward_std": 0.0590013712644577, "rewards/gemini_judge_reward_func/mean": 0.1138392835855484, "rewards/gemini_judge_reward_func/std": 0.2491879016160965, "rewards/semantic_correctness_reward_func/mean": 0.4119459092617035, "rewards/semantic_correctness_reward_func/std": 0.22772468626499176, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 159.52232360839844, "completions/mean_terminated_length": 147.78733825683594, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.10618411506124366, "grad_norm": 0.02124634012579918, "kl": 0.014283180236816406, "learning_rate": 8.446767045592829e-06, "loss": -0.0032, "num_tokens": 110842627.0, "reward": 0.40707066655158997, "reward_std": 0.058956243097782135, "rewards/gemini_judge_reward_func/mean": 0.1049107164144516, "rewards/gemini_judge_reward_func/std": 0.20404648780822754, "rewards/semantic_correctness_reward_func/mean": 0.39553165435791016, "rewards/semantic_correctness_reward_func/std": 0.20496897399425507, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 144.42857360839844, "completions/mean_terminated_length": 136.50450134277344, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.10652554308394861, "grad_norm": 0.023235971108078957, "kl": 0.017826557159423828, "learning_rate": 8.434770068665723e-06, "loss": -0.02, "num_tokens": 111209299.0, "reward": 0.39658233523368835, "reward_std": 0.0629458948969841, "rewards/gemini_judge_reward_func/mean": 0.1640625, "rewards/gemini_judge_reward_func/std": 0.2691808044910431, "rewards/semantic_correctness_reward_func/mean": 0.4392865300178528, "rewards/semantic_correctness_reward_func/std": 0.20642463862895966, "rewards/xmlcount_reward_func/mean": 0.6077500581741333, "rewards/xmlcount_reward_func/std": 0.48996880650520325, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 174.95982360839844, "completions/mean_terminated_length": 147.57142639160156, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.10686697110665358, "grad_norm": 0.025226525962352753, "kl": 0.013714790344238281, "learning_rate": 8.422735529643445e-06, "loss": -0.0039, "num_tokens": 111573746.0, "reward": 0.4315214157104492, "reward_std": 0.0645042136311531, "rewards/gemini_judge_reward_func/mean": 0.1026785746216774, "rewards/gemini_judge_reward_func/std": 0.22845152020454407, "rewards/semantic_correctness_reward_func/mean": 0.41499972343444824, "rewards/semantic_correctness_reward_func/std": 0.20047926902770996, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 155.8482208251953, "completions/mean_terminated_length": 144.0633544921875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.10720839912935855, "grad_norm": 0.020772015675902367, "kl": 0.014401912689208984, "learning_rate": 8.410663560133784e-06, "loss": -0.0003, "num_tokens": 111937032.0, "reward": 0.4510946273803711, "reward_std": 0.06376608461141586, "rewards/gemini_judge_reward_func/mean": 0.1540178507566452, "rewards/gemini_judge_reward_func/std": 0.25662854313850403, "rewards/semantic_correctness_reward_func/mean": 0.4459371864795685, "rewards/semantic_correctness_reward_func/std": 0.2272828072309494, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 160.9732208251953, "completions/mean_terminated_length": 141.26939392089844, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1075498271520635, "grad_norm": 0.02254970371723175, "kl": 0.017621517181396484, "learning_rate": 8.398554292153866e-06, "loss": 0.0044, "num_tokens": 112320054.0, "reward": 0.4214918315410614, "reward_std": 0.05463023856282234, "rewards/gemini_judge_reward_func/mean": 0.1339285671710968, "rewards/gemini_judge_reward_func/std": 0.2570280432701111, "rewards/semantic_correctness_reward_func/mean": 0.4453520178794861, "rewards/semantic_correctness_reward_func/std": 0.21047256886959076, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 147.58482360839844, "completions/mean_terminated_length": 139.68919372558594, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.10789125517476847, "grad_norm": 0.023806257173419, "kl": 0.014993429183959961, "learning_rate": 8.386407858128707e-06, "loss": 0.0018, "num_tokens": 112692217.0, "reward": 0.38690313696861267, "reward_std": 0.06407325714826584, "rewards/gemini_judge_reward_func/mean": 0.09375, "rewards/gemini_judge_reward_func/std": 0.19843840599060059, "rewards/semantic_correctness_reward_func/mean": 0.41531017422676086, "rewards/semantic_correctness_reward_func/std": 0.19655534625053406, "rewards/xmlcount_reward_func/mean": 0.6658526659011841, "rewards/xmlcount_reward_func/std": 0.4686855971813202, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 170.46875, "completions/mean_terminated_length": 142.93548583984375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.10823268319747344, "grad_norm": 0.02215094119310379, "kl": 0.016452312469482422, "learning_rate": 8.37422439088976e-06, "loss": -0.0031, "num_tokens": 113080238.0, "reward": 0.4259691536426544, "reward_std": 0.0712801143527031, "rewards/gemini_judge_reward_func/mean": 0.1640625, "rewards/gemini_judge_reward_func/std": 0.2541865110397339, "rewards/semantic_correctness_reward_func/mean": 0.4343097507953644, "rewards/semantic_correctness_reward_func/std": 0.22840382158756256, "rewards/xmlcount_reward_func/mean": 0.6837054491043091, "rewards/xmlcount_reward_func/std": 0.46439453959465027, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 166.27679443359375, "completions/mean_terminated_length": 142.6697235107422, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.10857411122017839, "grad_norm": 0.021131092682480812, "kl": 0.017343997955322266, "learning_rate": 8.362004023673473e-06, "loss": -0.0116, "num_tokens": 113452164.0, "reward": 0.3965999484062195, "reward_std": 0.06723373383283615, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.24715863168239594, "rewards/semantic_correctness_reward_func/mean": 0.40355318784713745, "rewards/semantic_correctness_reward_func/std": 0.1819203644990921, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 170.62054443359375, "completions/mean_terminated_length": 151.13697814941406, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.10891553924288336, "grad_norm": 0.023129483684897423, "kl": 0.01513051986694336, "learning_rate": 8.349746890119826e-06, "loss": 0.0071, "num_tokens": 113825511.0, "reward": 0.4296068251132965, "reward_std": 0.054870616644620895, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.2571618854999542, "rewards/semantic_correctness_reward_func/mean": 0.4613375663757324, "rewards/semantic_correctness_reward_func/std": 0.20210479199886322, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 161.46429443359375, "completions/mean_terminated_length": 145.7818145751953, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.10925696726558833, "grad_norm": 0.021242870017886162, "kl": 0.015431404113769531, "learning_rate": 8.337453124270864e-06, "loss": 0.0266, "num_tokens": 114167051.0, "reward": 0.4469578266143799, "reward_std": 0.07928242534399033, "rewards/gemini_judge_reward_func/mean": 0.1506696492433548, "rewards/gemini_judge_reward_func/std": 0.2936350107192993, "rewards/semantic_correctness_reward_func/mean": 0.3984406888484955, "rewards/semantic_correctness_reward_func/std": 0.2311331182718277, "rewards/xmlcount_reward_func/mean": 0.7675045132637024, "rewards/xmlcount_reward_func/std": 0.42328277230262756, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 161.5357208251953, "completions/mean_terminated_length": 145.8545379638672, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1095983952882933, "grad_norm": 0.02152419462800026, "kl": 0.016777515411376953, "learning_rate": 8.325122860569241e-06, "loss": -0.004, "num_tokens": 114533951.0, "reward": 0.4059690237045288, "reward_std": 0.05474109575152397, "rewards/gemini_judge_reward_func/mean": 0.1238839253783226, "rewards/gemini_judge_reward_func/std": 0.23317579925060272, "rewards/semantic_correctness_reward_func/mean": 0.4235771596431732, "rewards/semantic_correctness_reward_func/std": 0.21103839576244354, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 161.73214721679688, "completions/mean_terminated_length": 150.02716064453125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.10993982331099825, "grad_norm": 0.021091651171445847, "kl": 0.012056350708007812, "learning_rate": 8.31275623385675e-06, "loss": 0.0014, "num_tokens": 114893815.0, "reward": 0.4708484411239624, "reward_std": 0.06425228714942932, "rewards/gemini_judge_reward_func/mean": 0.1540178507566452, "rewards/gemini_judge_reward_func/std": 0.26837313175201416, "rewards/semantic_correctness_reward_func/mean": 0.4374563992023468, "rewards/semantic_correctness_reward_func/std": 0.2451786994934082, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 157.95089721679688, "completions/mean_terminated_length": 138.17807006835938, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.11028125133370321, "grad_norm": 0.02196911908686161, "kl": 0.01731395721435547, "learning_rate": 8.300353379372834e-06, "loss": -0.0236, "num_tokens": 115274700.0, "reward": 0.3980475962162018, "reward_std": 0.05808022618293762, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.225807324051857, "rewards/semantic_correctness_reward_func/mean": 0.4579342305660248, "rewards/semantic_correctness_reward_func/std": 0.18658004701137543, "rewards/xmlcount_reward_func/mean": 0.6456161141395569, "rewards/xmlcount_reward_func/std": 0.4789053201675415, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 169.99554443359375, "completions/mean_terminated_length": 138.36573791503906, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.11062267935640818, "grad_norm": 0.020508458837866783, "kl": 0.014233112335205078, "learning_rate": 8.287914432753123e-06, "loss": 0.0114, "num_tokens": 115661423.0, "reward": 0.3872678577899933, "reward_std": 0.05701467767357826, "rewards/gemini_judge_reward_func/mean": 0.0803571417927742, "rewards/gemini_judge_reward_func/std": 0.2042548656463623, "rewards/semantic_correctness_reward_func/mean": 0.41712480783462524, "rewards/semantic_correctness_reward_func/std": 0.1884647160768509, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 163.37054443359375, "completions/mean_terminated_length": 143.72145080566406, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.11096410737911314, "grad_norm": 0.01983817107975483, "kl": 0.013721704483032227, "learning_rate": 8.275439530027948e-06, "loss": 0.0108, "num_tokens": 116015134.0, "reward": 0.440729558467865, "reward_std": 0.05826781690120697, "rewards/gemini_judge_reward_func/mean": 0.1830357164144516, "rewards/gemini_judge_reward_func/std": 0.3022378087043762, "rewards/semantic_correctness_reward_func/mean": 0.4790761172771454, "rewards/semantic_correctness_reward_func/std": 0.2402830719947815, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853896975517273, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 170.1919708251953, "completions/mean_terminated_length": 154.66818237304688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1113055354018181, "grad_norm": 0.019503232091665268, "kl": 0.013273954391479492, "learning_rate": 8.262928807620843e-06, "loss": -0.0252, "num_tokens": 116375149.0, "reward": 0.44340530037879944, "reward_std": 0.05507688969373703, "rewards/gemini_judge_reward_func/mean": 0.1127232164144516, "rewards/gemini_judge_reward_func/std": 0.21272562444210052, "rewards/semantic_correctness_reward_func/mean": 0.4498923718929291, "rewards/semantic_correctness_reward_func/std": 0.21063929796218872, "rewards/xmlcount_reward_func/mean": 0.7708438038825989, "rewards/xmlcount_reward_func/std": 0.4208168685436249, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 165.9866180419922, "completions/mean_terminated_length": 142.37155151367188, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.11164696342452307, "grad_norm": 0.02120453119277954, "kl": 0.016252994537353516, "learning_rate": 8.250382402347066e-06, "loss": 0.0012, "num_tokens": 116728550.0, "reward": 0.4338933527469635, "reward_std": 0.06330207735300064, "rewards/gemini_judge_reward_func/mean": 0.0904017835855484, "rewards/gemini_judge_reward_func/std": 0.1992909461259842, "rewards/semantic_correctness_reward_func/mean": 0.41566306352615356, "rewards/semantic_correctness_reward_func/std": 0.1946808099746704, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 142.6428680419922, "completions/mean_terminated_length": 138.6905975341797, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.11198839144722803, "grad_norm": 0.022184768691658974, "kl": 0.015604972839355469, "learning_rate": 8.237800451412095e-06, "loss": -0.0052, "num_tokens": 117078914.0, "reward": 0.4221014678478241, "reward_std": 0.07018353044986725, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.2614849805831909, "rewards/semantic_correctness_reward_func/mean": 0.4238108694553375, "rewards/semantic_correctness_reward_func/std": 0.18395309150218964, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 160.99554443359375, "completions/mean_terminated_length": 133.15667724609375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.11232981946993299, "grad_norm": 0.022962870076298714, "kl": 0.016230106353759766, "learning_rate": 8.225183092410128e-06, "loss": -0.0089, "num_tokens": 117432929.0, "reward": 0.38007089495658875, "reward_std": 0.057407211512327194, "rewards/gemini_judge_reward_func/mean": 0.1149553582072258, "rewards/gemini_judge_reward_func/std": 0.22438155114650726, "rewards/semantic_correctness_reward_func/mean": 0.41198840737342834, "rewards/semantic_correctness_reward_func/std": 0.2041979283094406, "rewards/xmlcount_reward_func/mean": 0.6292276978492737, "rewards/xmlcount_reward_func/std": 0.4825986325740814, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 142.21429443359375, "completions/mean_terminated_length": 142.21429443359375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.11267124749263796, "grad_norm": 0.025767182931303978, "kl": 0.015546798706054688, "learning_rate": 8.212530463322584e-06, "loss": 0.0039, "num_tokens": 117773413.0, "reward": 0.4443262815475464, "reward_std": 0.0675266683101654, "rewards/gemini_judge_reward_func/mean": 0.1517857164144516, "rewards/gemini_judge_reward_func/std": 0.26012489199638367, "rewards/semantic_correctness_reward_func/mean": 0.45230987668037415, "rewards/semantic_correctness_reward_func/std": 0.2161291390657425, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 148.66964721679688, "completions/mean_terminated_length": 140.78378295898438, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.11301267551534293, "grad_norm": 0.022938484326004982, "kl": 0.01628732681274414, "learning_rate": 8.199842702516584e-06, "loss": -0.0092, "num_tokens": 118108139.0, "reward": 0.4258379638195038, "reward_std": 0.04991302639245987, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.225807324051857, "rewards/semantic_correctness_reward_func/mean": 0.4223683178424835, "rewards/semantic_correctness_reward_func/std": 0.20527078211307526, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 152.6741180419922, "completions/mean_terminated_length": 144.82432556152344, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.11335410353804788, "grad_norm": 0.02302442491054535, "kl": 0.017078876495361328, "learning_rate": 8.18711994874345e-06, "loss": 0.0042, "num_tokens": 118459982.0, "reward": 0.4548938274383545, "reward_std": 0.07729874551296234, "rewards/gemini_judge_reward_func/mean": 0.1462053507566452, "rewards/gemini_judge_reward_func/std": 0.248531773686409, "rewards/semantic_correctness_reward_func/mean": 0.4805581271648407, "rewards/semantic_correctness_reward_func/std": 0.2108439952135086, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 163.9375, "completions/mean_terminated_length": 148.29998779296875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.11369553156075285, "grad_norm": 0.02004287578165531, "kl": 0.013605833053588867, "learning_rate": 8.174362341137177e-06, "loss": -0.0139, "num_tokens": 118833092.0, "reward": 0.46299633383750916, "reward_std": 0.06390392780303955, "rewards/gemini_judge_reward_func/mean": 0.1662946492433548, "rewards/gemini_judge_reward_func/std": 0.2740088105201721, "rewards/semantic_correctness_reward_func/mean": 0.4640171229839325, "rewards/semantic_correctness_reward_func/std": 0.22488431632518768, "rewards/xmlcount_reward_func/mean": 0.7591875791549683, "rewards/xmlcount_reward_func/std": 0.42421701550483704, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 140.8616180419922, "completions/mean_terminated_length": 128.87330627441406, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.11403695958345782, "grad_norm": 0.01933170109987259, "kl": 0.01584768295288086, "learning_rate": 8.161570019212921e-06, "loss": -0.0117, "num_tokens": 119202209.0, "reward": 0.3510363698005676, "reward_std": 0.045246776193380356, "rewards/gemini_judge_reward_func/mean": 0.0837053582072258, "rewards/gemini_judge_reward_func/std": 0.18177036941051483, "rewards/semantic_correctness_reward_func/mean": 0.37227097153663635, "rewards/semantic_correctness_reward_func/std": 0.1903991997241974, "rewards/xmlcount_reward_func/mean": 0.6077500581741333, "rewards/xmlcount_reward_func/std": 0.48996880650520325, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 156.82144165039062, "completions/mean_terminated_length": 145.04977416992188, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.11437838760616277, "grad_norm": 0.020192094147205353, "kl": 0.017360687255859375, "learning_rate": 8.148743122865463e-06, "loss": 0.0078, "num_tokens": 119574049.0, "reward": 0.3997800946235657, "reward_std": 0.06350252032279968, "rewards/gemini_judge_reward_func/mean": 0.1529017835855484, "rewards/gemini_judge_reward_func/std": 0.2915138602256775, "rewards/semantic_correctness_reward_func/mean": 0.4418465793132782, "rewards/semantic_correctness_reward_func/std": 0.21829283237457275, "rewards/xmlcount_reward_func/mean": 0.6256250739097595, "rewards/xmlcount_reward_func/std": 0.48569241166114807, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 149.9732208251953, "completions/mean_terminated_length": 142.09910583496094, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.11471981562886774, "grad_norm": 0.021611372008919716, "kl": 0.017676830291748047, "learning_rate": 8.135881792367686e-06, "loss": 0.0081, "num_tokens": 119918387.0, "reward": 0.4103807508945465, "reward_std": 0.05042886361479759, "rewards/gemini_judge_reward_func/mean": 0.1015625, "rewards/gemini_judge_reward_func/std": 0.20777438580989838, "rewards/semantic_correctness_reward_func/mean": 0.38929662108421326, "rewards/semantic_correctness_reward_func/std": 0.2001960128545761, "rewards/xmlcount_reward_func/mean": 0.729741096496582, "rewards/xmlcount_reward_func/std": 0.4440862536430359, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 160.75, "completions/mean_terminated_length": 152.9729766845703, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.1150612436515727, "grad_norm": 0.020709911361336708, "kl": 0.01323080062866211, "learning_rate": 8.12298616836904e-06, "loss": -0.0084, "num_tokens": 120264343.0, "reward": 0.44805604219436646, "reward_std": 0.06720651686191559, "rewards/gemini_judge_reward_func/mean": 0.1116071417927742, "rewards/gemini_judge_reward_func/std": 0.23990675806999207, "rewards/semantic_correctness_reward_func/mean": 0.44406577944755554, "rewards/semantic_correctness_reward_func/std": 0.20191948115825653, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 148.8125, "completions/mean_terminated_length": 144.8878936767578, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.11540267167427766, "grad_norm": 0.021978365257382393, "kl": 0.014225482940673828, "learning_rate": 8.110056391894005e-06, "loss": 0.0029, "num_tokens": 120599325.0, "reward": 0.44396767020225525, "reward_std": 0.06658326089382172, "rewards/gemini_judge_reward_func/mean": 0.0825892835855484, "rewards/gemini_judge_reward_func/std": 0.19055145978927612, "rewards/semantic_correctness_reward_func/mean": 0.41909700632095337, "rewards/semantic_correctness_reward_func/std": 0.20795229077339172, "rewards/xmlcount_reward_func/mean": 0.8177813291549683, "rewards/xmlcount_reward_func/std": 0.3879494369029999, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 151.71875, "completions/mean_terminated_length": 147.80718994140625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.11574409969698263, "grad_norm": 0.022666750475764275, "kl": 0.017026901245117188, "learning_rate": 8.097092604340543e-06, "loss": -0.004, "num_tokens": 120944658.0, "reward": 0.45621275901794434, "reward_std": 0.07161962240934372, "rewards/gemini_judge_reward_func/mean": 0.1495535671710968, "rewards/gemini_judge_reward_func/std": 0.27193009853363037, "rewards/semantic_correctness_reward_func/mean": 0.43579572439193726, "rewards/semantic_correctness_reward_func/std": 0.23262245953083038, "rewards/xmlcount_reward_func/mean": 0.7730804085731506, "rewards/xmlcount_reward_func/std": 0.41802364587783813, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 153.79464721679688, "completions/mean_terminated_length": 149.8923797607422, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.1160855277196876, "grad_norm": 0.02177649922668934, "kl": 0.016196727752685547, "learning_rate": 8.084094947478556e-06, "loss": -0.0169, "num_tokens": 121279912.0, "reward": 0.4234395921230316, "reward_std": 0.05121118947863579, "rewards/gemini_judge_reward_func/mean": 0.0825892835855484, "rewards/gemini_judge_reward_func/std": 0.18151207268238068, "rewards/semantic_correctness_reward_func/mean": 0.37901926040649414, "rewards/semantic_correctness_reward_func/std": 0.20515023171901703, "rewards/xmlcount_reward_func/mean": 0.7864999771118164, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 146.62054443359375, "completions/mean_terminated_length": 142.6861114501953, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.11642695574239256, "grad_norm": 0.022630201652646065, "kl": 0.01521444320678711, "learning_rate": 8.071063563448341e-06, "loss": 0.0077, "num_tokens": 121619223.0, "reward": 0.4508695602416992, "reward_std": 0.07667838037014008, "rewards/gemini_judge_reward_func/mean": 0.1238839253783226, "rewards/gemini_judge_reward_func/std": 0.2615041136741638, "rewards/semantic_correctness_reward_func/mean": 0.4436066150665283, "rewards/semantic_correctness_reward_func/std": 0.20674681663513184, "rewards/xmlcount_reward_func/mean": 0.7814866304397583, "rewards/xmlcount_reward_func/std": 0.41452744603157043, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 155.80357360839844, "completions/mean_terminated_length": 140.01817321777344, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.11676838376509752, "grad_norm": 0.020520929247140884, "kl": 0.01617908477783203, "learning_rate": 8.057998594759022e-06, "loss": -0.0048, "num_tokens": 121980211.0, "reward": 0.4046533703804016, "reward_std": 0.05556685850024223, "rewards/gemini_judge_reward_func/mean": 0.0993303582072258, "rewards/gemini_judge_reward_func/std": 0.2317548543214798, "rewards/semantic_correctness_reward_func/mean": 0.43035584688186646, "rewards/semantic_correctness_reward_func/std": 0.18309862911701202, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 161.95089721679688, "completions/mean_terminated_length": 134.14285278320312, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.11710981178780248, "grad_norm": 0.02210637740790844, "kl": 0.017457008361816406, "learning_rate": 8.044900184287007e-06, "loss": -0.0198, "num_tokens": 122351324.0, "reward": 0.39233168959617615, "reward_std": 0.05195912346243858, "rewards/gemini_judge_reward_func/mean": 0.1395089328289032, "rewards/gemini_judge_reward_func/std": 0.268510639667511, "rewards/semantic_correctness_reward_func/mean": 0.4501582682132721, "rewards/semantic_correctness_reward_func/std": 0.2134057879447937, "rewards/xmlcount_reward_func/mean": 0.6162410378456116, "rewards/xmlcount_reward_func/std": 0.4834427833557129, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 156.8303680419922, "completions/mean_terminated_length": 149.0180206298828, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.11745123981050745, "grad_norm": 0.02535112388432026, "kl": 0.014339447021484375, "learning_rate": 8.031768475274412e-06, "loss": -0.0396, "num_tokens": 122719626.0, "reward": 0.49485448002815247, "reward_std": 0.07723495364189148, "rewards/gemini_judge_reward_func/mean": 0.1908482164144516, "rewards/gemini_judge_reward_func/std": 0.29018884897232056, "rewards/semantic_correctness_reward_func/mean": 0.4480757713317871, "rewards/semantic_correctness_reward_func/std": 0.20120421051979065, "rewards/xmlcount_reward_func/mean": 0.8222500681877136, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 144.80804443359375, "completions/mean_terminated_length": 136.88739013671875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1177926678332124, "grad_norm": 0.023620422929525375, "kl": 0.01935100555419922, "learning_rate": 8.018603611327505e-06, "loss": 0.0273, "num_tokens": 123068043.0, "reward": 0.436502605676651, "reward_std": 0.06161291524767876, "rewards/gemini_judge_reward_func/mean": 0.1417410671710968, "rewards/gemini_judge_reward_func/std": 0.2755933105945587, "rewards/semantic_correctness_reward_func/mean": 0.4363076984882355, "rewards/semantic_correctness_reward_func/std": 0.21314512193202972, "rewards/xmlcount_reward_func/mean": 0.7313615679740906, "rewards/xmlcount_reward_func/std": 0.4439382255077362, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 146.9107208251953, "completions/mean_terminated_length": 139.00901794433594, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.11813409585591737, "grad_norm": 0.023379402235150337, "kl": 0.020755767822265625, "learning_rate": 8.005405736415127e-06, "loss": 0.0179, "num_tokens": 123455955.0, "reward": 0.4183342158794403, "reward_std": 0.06782528012990952, "rewards/gemini_judge_reward_func/mean": 0.1227678582072258, "rewards/gemini_judge_reward_func/std": 0.22830908000469208, "rewards/semantic_correctness_reward_func/mean": 0.4518852233886719, "rewards/semantic_correctness_reward_func/std": 0.18604160845279694, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 150.29464721679688, "completions/mean_terminated_length": 142.42343139648438, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.11847552387862234, "grad_norm": 0.021145131438970566, "kl": 0.017850875854492188, "learning_rate": 7.992174994867124e-06, "loss": -0.0141, "num_tokens": 123812385.0, "reward": 0.4235004186630249, "reward_std": 0.07467382401227951, "rewards/gemini_judge_reward_func/mean": 0.1316964328289032, "rewards/gemini_judge_reward_func/std": 0.26034605503082275, "rewards/semantic_correctness_reward_func/mean": 0.4553859531879425, "rewards/semantic_correctness_reward_func/std": 0.19646060466766357, "rewards/xmlcount_reward_func/mean": 0.6993616819381714, "rewards/xmlcount_reward_func/std": 0.4591045677661896, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 137.13394165039062, "completions/mean_terminated_length": 137.13394165039062, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1188169519013273, "grad_norm": 0.023312104865908623, "kl": 0.018787860870361328, "learning_rate": 7.978911531372764e-06, "loss": 0.003, "num_tokens": 124133127.0, "reward": 0.4589785635471344, "reward_std": 0.06582622230052948, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.2379140406847, "rewards/semantic_correctness_reward_func/mean": 0.4205087721347809, "rewards/semantic_correctness_reward_func/std": 0.19802919030189514, "rewards/xmlcount_reward_func/mean": 0.8088437914848328, "rewards/xmlcount_reward_func/std": 0.3951219618320465, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 148.7366180419922, "completions/mean_terminated_length": 140.85134887695312, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.11915837992403226, "grad_norm": 0.0231174249202013, "kl": 0.025996685028076172, "learning_rate": 7.965615490979165e-06, "loss": -0.0209, "num_tokens": 124506008.0, "reward": 0.42070716619491577, "reward_std": 0.054260022938251495, "rewards/gemini_judge_reward_func/mean": 0.1071428582072258, "rewards/gemini_judge_reward_func/std": 0.22137534618377686, "rewards/semantic_correctness_reward_func/mean": 0.4056251347064972, "rewards/semantic_correctness_reward_func/std": 0.1975453943014145, "rewards/xmlcount_reward_func/mean": 0.7418125867843628, "rewards/xmlcount_reward_func/std": 0.4394664168357849, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 162.75, "completions/mean_terminated_length": 143.0867462158203, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.11949980794673723, "grad_norm": 0.019620204344391823, "kl": 0.014969110488891602, "learning_rate": 7.952287019089686e-06, "loss": -0.0175, "num_tokens": 124862112.0, "reward": 0.4290351867675781, "reward_std": 0.059172313660383224, "rewards/gemini_judge_reward_func/mean": 0.1149553582072258, "rewards/gemini_judge_reward_func/std": 0.21019315719604492, "rewards/semantic_correctness_reward_func/mean": 0.3958899676799774, "rewards/semantic_correctness_reward_func/std": 0.21406039595603943, "rewards/xmlcount_reward_func/mean": 0.7596875429153442, "rewards/xmlcount_reward_func/std": 0.4264892339706421, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 180.13394165039062, "completions/mean_terminated_length": 156.9082489013672, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.1198412359694422, "grad_norm": 0.02199356071650982, "kl": 0.016332149505615234, "learning_rate": 7.938926261462366e-06, "loss": 0.0142, "num_tokens": 125222638.0, "reward": 0.39731401205062866, "reward_std": 0.04134167358279228, "rewards/gemini_judge_reward_func/mean": 0.0758928582072258, "rewards/gemini_judge_reward_func/std": 0.18289919197559357, "rewards/semantic_correctness_reward_func/mean": 0.40478435158729553, "rewards/semantic_correctness_reward_func/std": 0.18352636694908142, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 149.4375, "completions/mean_terminated_length": 141.55856323242188, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.12018266399214715, "grad_norm": 0.023317914456129074, "kl": 0.017638683319091797, "learning_rate": 7.925533364208308e-06, "loss": 0.0142, "num_tokens": 125537380.0, "reward": 0.4412147104740143, "reward_std": 0.07194562256336212, "rewards/gemini_judge_reward_func/mean": 0.1171875, "rewards/gemini_judge_reward_func/std": 0.21941961348056793, "rewards/semantic_correctness_reward_func/mean": 0.4344485104084015, "rewards/semantic_correctness_reward_func/std": 0.2174639254808426, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 157.3303680419922, "completions/mean_terminated_length": 153.44395446777344, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.12052409201485212, "grad_norm": 0.02054346166551113, "kl": 0.013754844665527344, "learning_rate": 7.912108473790092e-06, "loss": 0.0265, "num_tokens": 125890782.0, "reward": 0.4809158742427826, "reward_std": 0.07998733222484589, "rewards/gemini_judge_reward_func/mean": 0.1685267835855484, "rewards/gemini_judge_reward_func/std": 0.26428356766700745, "rewards/semantic_correctness_reward_func/mean": 0.4587755799293518, "rewards/semantic_correctness_reward_func/std": 0.20430655777454376, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 151.66964721679688, "completions/mean_terminated_length": 147.7578582763672, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.12086552003755709, "grad_norm": 0.019831150770187378, "kl": 0.013668537139892578, "learning_rate": 7.898651737020166e-06, "loss": -0.0238, "num_tokens": 126230728.0, "reward": 0.4762882590293884, "reward_std": 0.07177340984344482, "rewards/gemini_judge_reward_func/mean": 0.1395089328289032, "rewards/gemini_judge_reward_func/std": 0.24901461601257324, "rewards/semantic_correctness_reward_func/mean": 0.45792320370674133, "rewards/semantic_correctness_reward_func/std": 0.21523572504520416, "rewards/xmlcount_reward_func/mean": 0.8222500681877136, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 184.16964721679688, "completions/mean_terminated_length": 153.0648193359375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.12120694806026204, "grad_norm": 0.02033446729183197, "kl": 0.013987541198730469, "learning_rate": 7.885163301059251e-06, "loss": 0.0056, "num_tokens": 126579194.0, "reward": 0.4241471588611603, "reward_std": 0.045106999576091766, "rewards/gemini_judge_reward_func/mean": 0.0814732164144516, "rewards/gemini_judge_reward_func/std": 0.2058626413345337, "rewards/semantic_correctness_reward_func/mean": 0.3758426010608673, "rewards/semantic_correctness_reward_func/std": 0.1890416443347931, "rewards/xmlcount_reward_func/mean": 0.7909732460975647, "rewards/xmlcount_reward_func/std": 0.4057386815547943, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 148.33482360839844, "completions/mean_terminated_length": 140.44595336914062, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.12154837608296701, "grad_norm": 0.022476162761449814, "kl": 0.015032291412353516, "learning_rate": 7.871643313414718e-06, "loss": -0.0142, "num_tokens": 126935869.0, "reward": 0.4261614680290222, "reward_std": 0.0577315129339695, "rewards/gemini_judge_reward_func/mean": 0.1026785746216774, "rewards/gemini_judge_reward_func/std": 0.22223277390003204, "rewards/semantic_correctness_reward_func/mean": 0.43737873435020447, "rewards/semantic_correctness_reward_func/std": 0.19673730432987213, "rewards/xmlcount_reward_func/mean": 0.7440357208251953, "rewards/xmlcount_reward_func/std": 0.43435025215148926, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 148.95089721679688, "completions/mean_terminated_length": 141.06756591796875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.12188980410567198, "grad_norm": 0.023637736216187477, "kl": 0.016416549682617188, "learning_rate": 7.858091921938989e-06, "loss": -0.0125, "num_tokens": 127282282.0, "reward": 0.4587040841579437, "reward_std": 0.07693413645029068, "rewards/gemini_judge_reward_func/mean": 0.1763392835855484, "rewards/gemini_judge_reward_func/std": 0.3062107563018799, "rewards/semantic_correctness_reward_func/mean": 0.47061866521835327, "rewards/semantic_correctness_reward_func/std": 0.21844151616096497, "rewards/xmlcount_reward_func/mean": 0.735111653804779, "rewards/xmlcount_reward_func/std": 0.4418267011642456, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 151.4553680419922, "completions/mean_terminated_length": 143.5946044921875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.12223123212837694, "grad_norm": 0.023121589794754982, "kl": 0.017974853515625, "learning_rate": 7.844509274827907e-06, "loss": 0.0005, "num_tokens": 127638940.0, "reward": 0.41608354449272156, "reward_std": 0.05893407762050629, "rewards/gemini_judge_reward_func/mean": 0.0926339253783226, "rewards/gemini_judge_reward_func/std": 0.2171497493982315, "rewards/semantic_correctness_reward_func/mean": 0.4293998181819916, "rewards/semantic_correctness_reward_func/std": 0.20812109112739563, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 159.6428680419922, "completions/mean_terminated_length": 147.90951538085938, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.1225726601510819, "grad_norm": 0.02072990871965885, "kl": 0.015190601348876953, "learning_rate": 7.830895520619129e-06, "loss": -0.0122, "num_tokens": 127996884.0, "reward": 0.4240740239620209, "reward_std": 0.06780924648046494, "rewards/gemini_judge_reward_func/mean": 0.1071428582072258, "rewards/gemini_judge_reward_func/std": 0.2175440937280655, "rewards/semantic_correctness_reward_func/mean": 0.40458425879478455, "rewards/semantic_correctness_reward_func/std": 0.21169018745422363, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 158.33929443359375, "completions/mean_terminated_length": 142.59999084472656, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.12291408817378686, "grad_norm": 0.02120456099510193, "kl": 0.01677846908569336, "learning_rate": 7.817250808190483e-06, "loss": -0.0013, "num_tokens": 128351412.0, "reward": 0.419956237077713, "reward_std": 0.06337090581655502, "rewards/gemini_judge_reward_func/mean": 0.1127232164144516, "rewards/gemini_judge_reward_func/std": 0.24459390342235565, "rewards/semantic_correctness_reward_func/mean": 0.4085846543312073, "rewards/semantic_correctness_reward_func/std": 0.20613060891628265, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 169.60714721679688, "completions/mean_terminated_length": 137.9629669189453, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.12325551619649183, "grad_norm": 0.022048698738217354, "kl": 0.018438339233398438, "learning_rate": 7.803575286758365e-06, "loss": -0.0032, "num_tokens": 128731148.0, "reward": 0.41158369183540344, "reward_std": 0.06956712901592255, "rewards/gemini_judge_reward_func/mean": 0.1439732164144516, "rewards/gemini_judge_reward_func/std": 0.2618865966796875, "rewards/semantic_correctness_reward_func/mean": 0.4472218155860901, "rewards/semantic_correctness_reward_func/std": 0.21267808973789215, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 168.65625, "completions/mean_terminated_length": 153.1045379638672, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.12359694421919679, "grad_norm": 0.02125149965286255, "kl": 0.016147613525390625, "learning_rate": 7.789869105876083e-06, "loss": -0.0622, "num_tokens": 129093431.0, "reward": 0.43265652656555176, "reward_std": 0.07214810699224472, "rewards/gemini_judge_reward_func/mean": 0.1395089328289032, "rewards/gemini_judge_reward_func/std": 0.2621731162071228, "rewards/semantic_correctness_reward_func/mean": 0.4043269753456116, "rewards/semantic_correctness_reward_func/std": 0.2230450063943863, "rewards/xmlcount_reward_func/mean": 0.7399688363075256, "rewards/xmlcount_reward_func/std": 0.43580862879753113, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 158.22769165039062, "completions/mean_terminated_length": 146.47511291503906, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.12393837224190175, "grad_norm": 0.021313535049557686, "kl": 0.01935601234436035, "learning_rate": 7.776132415432234e-06, "loss": -0.0092, "num_tokens": 129453758.0, "reward": 0.4301838278770447, "reward_std": 0.0693366751074791, "rewards/gemini_judge_reward_func/mean": 0.1886160671710968, "rewards/gemini_judge_reward_func/std": 0.30937135219573975, "rewards/semantic_correctness_reward_func/mean": 0.4406421482563019, "rewards/semantic_correctness_reward_func/std": 0.23391857743263245, "rewards/xmlcount_reward_func/mean": 0.6665223836898804, "rewards/xmlcount_reward_func/std": 0.46999096870422363, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 158.55357360839844, "completions/mean_terminated_length": 134.7339324951172, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.12427980026460672, "grad_norm": 0.02203877456486225, "kl": 0.02167510986328125, "learning_rate": 7.762365365649068e-06, "loss": -0.0115, "num_tokens": 129829190.0, "reward": 0.3783107399940491, "reward_std": 0.055117614567279816, "rewards/gemini_judge_reward_func/mean": 0.0959821417927742, "rewards/gemini_judge_reward_func/std": 0.22271645069122314, "rewards/semantic_correctness_reward_func/mean": 0.44833922386169434, "rewards/semantic_correctness_reward_func/std": 0.17805521190166473, "rewards/xmlcount_reward_func/mean": 0.6256250143051147, "rewards/xmlcount_reward_func/std": 0.48569241166114807, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 154.47769165039062, "completions/mean_terminated_length": 150.57847595214844, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.12462122828731168, "grad_norm": 0.021525070071220398, "kl": 0.015169620513916016, "learning_rate": 7.748568107080831e-06, "loss": -0.015, "num_tokens": 130180113.0, "reward": 0.4638497829437256, "reward_std": 0.05021931231021881, "rewards/gemini_judge_reward_func/mean": 0.1004464253783226, "rewards/gemini_judge_reward_func/std": 0.22575192153453827, "rewards/semantic_correctness_reward_func/mean": 0.40235573053359985, "rewards/semantic_correctness_reward_func/std": 0.215322345495224, "rewards/xmlcount_reward_func/mean": 0.8580000996589661, "rewards/xmlcount_reward_func/std": 0.35106155276298523, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 161.45089721679688, "completions/mean_terminated_length": 141.7579803466797, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.12496265631001664, "grad_norm": 0.020651815459132195, "kl": 0.01579141616821289, "learning_rate": 7.734740790612137e-06, "loss": -0.0091, "num_tokens": 130554842.0, "reward": 0.37947192788124084, "reward_std": 0.05589864403009415, "rewards/gemini_judge_reward_func/mean": 0.0758928582072258, "rewards/gemini_judge_reward_func/std": 0.1798083484172821, "rewards/semantic_correctness_reward_func/mean": 0.3870737552642822, "rewards/semantic_correctness_reward_func/std": 0.18478237092494965, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 158.6116180419922, "completions/mean_terminated_length": 154.73095703125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.1253040843327216, "grad_norm": 0.019816717132925987, "kl": 0.016491413116455078, "learning_rate": 7.720883567456299e-06, "loss": -0.0226, "num_tokens": 130900519.0, "reward": 0.43501195311546326, "reward_std": 0.06687616556882858, "rewards/gemini_judge_reward_func/mean": 0.1383928507566452, "rewards/gemini_judge_reward_func/std": 0.24338625371456146, "rewards/semantic_correctness_reward_func/mean": 0.43252378702163696, "rewards/semantic_correctness_reward_func/std": 0.20659232139587402, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 151.45982360839844, "completions/mean_terminated_length": 143.59910583496094, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.12564551235542656, "grad_norm": 0.02021671086549759, "kl": 0.017315387725830078, "learning_rate": 7.70699658915369e-06, "loss": -0.0132, "num_tokens": 131242978.0, "reward": 0.45236364006996155, "reward_std": 0.07268624007701874, "rewards/gemini_judge_reward_func/mean": 0.1685267835855484, "rewards/gemini_judge_reward_func/std": 0.292473703622818, "rewards/semantic_correctness_reward_func/mean": 0.4635234475135803, "rewards/semantic_correctness_reward_func/std": 0.20471033453941345, "rewards/xmlcount_reward_func/mean": 0.7306205630302429, "rewards/xmlcount_reward_func/std": 0.4441879093647003, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 167.49554443359375, "completions/mean_terminated_length": 151.9227294921875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.12598694037813155, "grad_norm": 0.023807330057024956, "kl": 0.01745128631591797, "learning_rate": 7.693080007570084e-06, "loss": 0.0224, "num_tokens": 131625237.0, "reward": 0.4182495176792145, "reward_std": 0.07036899775266647, "rewards/gemini_judge_reward_func/mean": 0.1551339328289032, "rewards/gemini_judge_reward_func/std": 0.26403728127479553, "rewards/semantic_correctness_reward_func/mean": 0.4224795699119568, "rewards/semantic_correctness_reward_func/std": 0.2183416783809662, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 148.90179443359375, "completions/mean_terminated_length": 141.0180206298828, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1263283684008365, "grad_norm": 0.02168549969792366, "kl": 0.016880512237548828, "learning_rate": 7.679133974894984e-06, "loss": -0.0272, "num_tokens": 131959631.0, "reward": 0.42804205417633057, "reward_std": 0.05581650137901306, "rewards/gemini_judge_reward_func/mean": 0.1104910746216774, "rewards/gemini_judge_reward_func/std": 0.24675332009792328, "rewards/semantic_correctness_reward_func/mean": 0.4177280068397522, "rewards/semantic_correctness_reward_func/std": 0.20709756016731262, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 156.0357208251953, "completions/mean_terminated_length": 148.21621704101562, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.12666979642354145, "grad_norm": 0.02178873121738434, "kl": 0.01481771469116211, "learning_rate": 7.66515864363997e-06, "loss": -0.0008, "num_tokens": 132312491.0, "reward": 0.475494921207428, "reward_std": 0.0693972259759903, "rewards/gemini_judge_reward_func/mean": 0.1662946492433548, "rewards/gemini_judge_reward_func/std": 0.2516138553619385, "rewards/semantic_correctness_reward_func/mean": 0.47188514471054077, "rewards/semantic_correctness_reward_func/std": 0.2055116444826126, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 148.68304443359375, "completions/mean_terminated_length": 140.79730224609375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.12701122444624643, "grad_norm": 0.021552162244915962, "kl": 0.01751708984375, "learning_rate": 7.651154166637025e-06, "loss": 0.0188, "num_tokens": 132678580.0, "reward": 0.4199696183204651, "reward_std": 0.059298258274793625, "rewards/gemini_judge_reward_func/mean": 0.1171875, "rewards/gemini_judge_reward_func/std": 0.23184122145175934, "rewards/semantic_correctness_reward_func/mean": 0.44441038370132446, "rewards/semantic_correctness_reward_func/std": 0.20885135233402252, "rewards/xmlcount_reward_func/mean": 0.7105312943458557, "rewards/xmlcount_reward_func/std": 0.4553159773349762, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 160.8169708251953, "completions/mean_terminated_length": 156.94619750976562, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.1273526524689514, "grad_norm": 0.021256279200315475, "kl": 0.01796579360961914, "learning_rate": 7.637120697036866e-06, "loss": 0.0054, "num_tokens": 133042543.0, "reward": 0.40777140855789185, "reward_std": 0.0612851157784462, "rewards/gemini_judge_reward_func/mean": 0.1037946417927742, "rewards/gemini_judge_reward_func/std": 0.20393303036689758, "rewards/semantic_correctness_reward_func/mean": 0.4012675881385803, "rewards/semantic_correctness_reward_func/std": 0.19751423597335815, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 180.0982208251953, "completions/mean_terminated_length": 148.84259033203125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.12769408049165634, "grad_norm": 0.021484747529029846, "kl": 0.015058517456054688, "learning_rate": 7.62305838830727e-06, "loss": -0.019, "num_tokens": 133408461.0, "reward": 0.3818875849246979, "reward_std": 0.05296236649155617, "rewards/gemini_judge_reward_func/mean": 0.0870535746216774, "rewards/gemini_judge_reward_func/std": 0.20286573469638824, "rewards/semantic_correctness_reward_func/mean": 0.3946877419948578, "rewards/semantic_correctness_reward_func/std": 0.17419084906578064, "rewards/xmlcount_reward_func/mean": 0.6703214645385742, "rewards/xmlcount_reward_func/std": 0.4670778214931488, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 148.1116180419922, "completions/mean_terminated_length": 144.18386840820312, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.12803550851436132, "grad_norm": 0.021499551832675934, "kl": 0.018602371215820312, "learning_rate": 7.608967394231387e-06, "loss": -0.0345, "num_tokens": 133775478.0, "reward": 0.41469037532806396, "reward_std": 0.04859733209013939, "rewards/gemini_judge_reward_func/mean": 0.125, "rewards/gemini_judge_reward_func/std": 0.2426035851240158, "rewards/semantic_correctness_reward_func/mean": 0.42921966314315796, "rewards/semantic_correctness_reward_func/std": 0.20824337005615234, "rewards/xmlcount_reward_func/mean": 0.6971160769462585, "rewards/xmlcount_reward_func/std": 0.46128448843955994, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 160.82589721679688, "completions/mean_terminated_length": 141.11871337890625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.12837693653706628, "grad_norm": 0.021708445623517036, "kl": 0.017911672592163086, "learning_rate": 7.594847868906076e-06, "loss": -0.0095, "num_tokens": 134139351.0, "reward": 0.42823106050491333, "reward_std": 0.06395815312862396, "rewards/gemini_judge_reward_func/mean": 0.1361607164144516, "rewards/gemini_judge_reward_func/std": 0.26552340388298035, "rewards/semantic_correctness_reward_func/mean": 0.43883365392684937, "rewards/semantic_correctness_reward_func/std": 0.21176576614379883, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 162.5357208251953, "completions/mean_terminated_length": 146.8727264404297, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.12871836455977123, "grad_norm": 0.020864030346274376, "kl": 0.01667308807373047, "learning_rate": 7.580699966740201e-06, "loss": 0.0403, "num_tokens": 134527599.0, "reward": 0.4344416558742523, "reward_std": 0.07495336979627609, "rewards/gemini_judge_reward_func/mean": 0.1808035671710968, "rewards/gemini_judge_reward_func/std": 0.2884351909160614, "rewards/semantic_correctness_reward_func/mean": 0.4521009027957916, "rewards/semantic_correctness_reward_func/std": 0.21390804648399353, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 176.68751525878906, "completions/mean_terminated_length": 157.3424530029297, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1290597925824762, "grad_norm": 0.01891087181866169, "kl": 0.016262054443359375, "learning_rate": 7.566523842452958e-06, "loss": -0.0123, "num_tokens": 134889049.0, "reward": 0.4385979175567627, "reward_std": 0.0712980329990387, "rewards/gemini_judge_reward_func/mean": 0.1216517835855484, "rewards/gemini_judge_reward_func/std": 0.2482900321483612, "rewards/semantic_correctness_reward_func/mean": 0.44818589091300964, "rewards/semantic_correctness_reward_func/std": 0.20277683436870575, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 155.92857360839844, "completions/mean_terminated_length": 144.14480590820312, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.12940122060518117, "grad_norm": 0.02066349983215332, "kl": 0.013991832733154297, "learning_rate": 7.552319651072164e-06, "loss": -0.0142, "num_tokens": 135215325.0, "reward": 0.4548727571964264, "reward_std": 0.058319687843322754, "rewards/gemini_judge_reward_func/mean": 0.1037946417927742, "rewards/gemini_judge_reward_func/std": 0.22107842564582825, "rewards/semantic_correctness_reward_func/mean": 0.4222742021083832, "rewards/semantic_correctness_reward_func/std": 0.19840273261070251, "rewards/xmlcount_reward_func/mean": 0.8222500681877136, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 163.63394165039062, "completions/mean_terminated_length": 147.99090576171875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.12974264862788612, "grad_norm": 0.019953692331910133, "kl": 0.016832828521728516, "learning_rate": 7.5380875479325855e-06, "loss": -0.0387, "num_tokens": 135580519.0, "reward": 0.47378772497177124, "reward_std": 0.06329935044050217, "rewards/gemini_judge_reward_func/mean": 0.1830357164144516, "rewards/gemini_judge_reward_func/std": 0.2889639437198639, "rewards/semantic_correctness_reward_func/mean": 0.4656168818473816, "rewards/semantic_correctness_reward_func/std": 0.25280624628067017, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 156.5669708251953, "completions/mean_terminated_length": 152.67713928222656, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1300840766505911, "grad_norm": 0.01964419148862362, "kl": 0.017740726470947266, "learning_rate": 7.52382768867422e-06, "loss": 0.007, "num_tokens": 135927274.0, "reward": 0.4654104709625244, "reward_std": 0.05828892067074776, "rewards/gemini_judge_reward_func/mean": 0.1149553582072258, "rewards/gemini_judge_reward_func/std": 0.219328373670578, "rewards/semantic_correctness_reward_func/mean": 0.4302842915058136, "rewards/semantic_correctness_reward_func/std": 0.21055085957050323, "rewards/xmlcount_reward_func/mean": 0.833428680896759, "rewards/xmlcount_reward_func/std": 0.37002047896385193, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 148.58929443359375, "completions/mean_terminated_length": 136.7058868408203, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.13042550467329606, "grad_norm": 0.019918564707040787, "kl": 0.01675271987915039, "learning_rate": 7.509540229240601e-06, "loss": -0.0218, "num_tokens": 136290766.0, "reward": 0.3735648989677429, "reward_std": 0.05948694050312042, "rewards/gemini_judge_reward_func/mean": 0.1060267835855484, "rewards/gemini_judge_reward_func/std": 0.21485590934753418, "rewards/semantic_correctness_reward_func/mean": 0.44027090072631836, "rewards/semantic_correctness_reward_func/std": 0.20193372666835785, "rewards/xmlcount_reward_func/mean": 0.6077500581741333, "rewards/xmlcount_reward_func/std": 0.48996880650520325, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 151.4241180419922, "completions/mean_terminated_length": 143.5630645751953, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.13076693269600104, "grad_norm": 0.02249898388981819, "kl": 0.019742965698242188, "learning_rate": 7.4952253258771036e-06, "loss": 0.0399, "num_tokens": 136652045.0, "reward": 0.436825156211853, "reward_std": 0.056539103388786316, "rewards/gemini_judge_reward_func/mean": 0.1685267835855484, "rewards/gemini_judge_reward_func/std": 0.29533451795578003, "rewards/semantic_correctness_reward_func/mean": 0.4528220593929291, "rewards/semantic_correctness_reward_func/std": 0.21148590743541718, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 161.2857208251953, "completions/mean_terminated_length": 153.51351928710938, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.131108360718706, "grad_norm": 0.02023939974606037, "kl": 0.015448570251464844, "learning_rate": 7.480883135129211e-06, "loss": -0.0074, "num_tokens": 137023725.0, "reward": 0.4137805700302124, "reward_std": 0.05417332798242569, "rewards/gemini_judge_reward_func/mean": 0.0926339253783226, "rewards/gemini_judge_reward_func/std": 0.18509900569915771, "rewards/semantic_correctness_reward_func/mean": 0.41341152787208557, "rewards/semantic_correctness_reward_func/std": 0.19928321242332458, "rewards/xmlcount_reward_func/mean": 0.735111653804779, "rewards/xmlcount_reward_func/std": 0.4418267011642456, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 145.07589721679688, "completions/mean_terminated_length": 141.13453674316406, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.13144978874141094, "grad_norm": 0.022299103438854218, "kl": 0.018901348114013672, "learning_rate": 7.4665138138408255e-06, "loss": 0.0139, "num_tokens": 137355434.0, "reward": 0.4558815360069275, "reward_std": 0.07524207979440689, "rewards/gemini_judge_reward_func/mean": 0.1462053507566452, "rewards/gemini_judge_reward_func/std": 0.26490774750709534, "rewards/semantic_correctness_reward_func/mean": 0.45868438482284546, "rewards/semantic_correctness_reward_func/std": 0.23558257520198822, "rewards/xmlcount_reward_func/mean": 0.7641563415527344, "rewards/xmlcount_reward_func/std": 0.4263768792152405, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 160.26339721679688, "completions/mean_terminated_length": 148.53846740722656, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.13179121676411593, "grad_norm": 0.022259045392274857, "kl": 0.017290592193603516, "learning_rate": 7.452117519152542e-06, "loss": 0.0129, "num_tokens": 137707349.0, "reward": 0.4224461615085602, "reward_std": 0.05426723137497902, "rewards/gemini_judge_reward_func/mean": 0.0892857164144516, "rewards/gemini_judge_reward_func/std": 0.1782989650964737, "rewards/semantic_correctness_reward_func/mean": 0.39640921354293823, "rewards/semantic_correctness_reward_func/std": 0.18904563784599304, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 156.45982360839844, "completions/mean_terminated_length": 144.68325805664062, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.13213264478682088, "grad_norm": 0.02163863182067871, "kl": 0.01602315902709961, "learning_rate": 7.437694408499932e-06, "loss": 0.0092, "num_tokens": 138051268.0, "reward": 0.43781083822250366, "reward_std": 0.05744494870305061, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.23313553631305695, "rewards/semantic_correctness_reward_func/mean": 0.4464828670024872, "rewards/semantic_correctness_reward_func/std": 0.19655469059944153, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 152.40179443359375, "completions/mean_terminated_length": 144.549560546875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.13247407280952583, "grad_norm": 0.022210579365491867, "kl": 0.017316818237304688, "learning_rate": 7.4232446396118265e-06, "loss": -0.0113, "num_tokens": 138391914.0, "reward": 0.4606318771839142, "reward_std": 0.08661013096570969, "rewards/gemini_judge_reward_func/mean": 0.1529017835855484, "rewards/gemini_judge_reward_func/std": 0.2621540129184723, "rewards/semantic_correctness_reward_func/mean": 0.42435577511787415, "rewards/semantic_correctness_reward_func/std": 0.20760102570056915, "rewards/xmlcount_reward_func/mean": 0.786500096321106, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 149.21875, "completions/mean_terminated_length": 145.2959747314453, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.13281550083223082, "grad_norm": 0.021780794486403465, "kl": 0.016431808471679688, "learning_rate": 7.408768370508577e-06, "loss": -0.02, "num_tokens": 138750395.0, "reward": 0.42754751443862915, "reward_std": 0.05521192029118538, "rewards/gemini_judge_reward_func/mean": 0.0915178582072258, "rewards/gemini_judge_reward_func/std": 0.1980723738670349, "rewards/semantic_correctness_reward_func/mean": 0.4174516499042511, "rewards/semantic_correctness_reward_func/std": 0.20846490561962128, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 161.19644165039062, "completions/mean_terminated_length": 145.5090789794922, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.13315692885493577, "grad_norm": 0.021414414048194885, "kl": 0.018945693969726562, "learning_rate": 7.394265759500348e-06, "loss": -0.0034, "num_tokens": 139114735.0, "reward": 0.41808784008026123, "reward_std": 0.06346622854471207, "rewards/gemini_judge_reward_func/mean": 0.1194196417927742, "rewards/gemini_judge_reward_func/std": 0.2319059669971466, "rewards/semantic_correctness_reward_func/mean": 0.4215996563434601, "rewards/semantic_correctness_reward_func/std": 0.21271347999572754, "rewards/xmlcount_reward_func/mean": 0.7150000929832458, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 167.0625, "completions/mean_terminated_length": 155.42987060546875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.13349835687764072, "grad_norm": 0.02051488868892193, "kl": 0.016121387481689453, "learning_rate": 7.379736965185369e-06, "loss": -0.0242, "num_tokens": 139485993.0, "reward": 0.44687801599502563, "reward_std": 0.055957481265068054, "rewards/gemini_judge_reward_func/mean": 0.0926339253783226, "rewards/gemini_judge_reward_func/std": 0.18204548954963684, "rewards/semantic_correctness_reward_func/mean": 0.4403719902038574, "rewards/semantic_correctness_reward_func/std": 0.19233807921409607, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 150.4375, "completions/mean_terminated_length": 142.56756591796875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1338397849003457, "grad_norm": 0.025922337546944618, "kl": 0.018519878387451172, "learning_rate": 7.365182146448205e-06, "loss": -0.0162, "num_tokens": 139818011.0, "reward": 0.4373648762702942, "reward_std": 0.05904890224337578, "rewards/gemini_judge_reward_func/mean": 0.1316964328289032, "rewards/gemini_judge_reward_func/std": 0.23428121209144592, "rewards/semantic_correctness_reward_func/mean": 0.4219311773777008, "rewards/semantic_correctness_reward_func/std": 0.224158376455307, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 146.89732360839844, "completions/mean_terminated_length": 142.96412658691406, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.13418121292305066, "grad_norm": 0.021265504881739616, "kl": 0.016716480255126953, "learning_rate": 7.350601462458025e-06, "loss": -0.0039, "num_tokens": 140184364.0, "reward": 0.4281903803348541, "reward_std": 0.0652979239821434, "rewards/gemini_judge_reward_func/mean": 0.1138392835855484, "rewards/gemini_judge_reward_func/std": 0.2514272928237915, "rewards/semantic_correctness_reward_func/mean": 0.4117732644081116, "rewards/semantic_correctness_reward_func/std": 0.1989647001028061, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 159.44644165039062, "completions/mean_terminated_length": 143.72726440429688, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.1345226409457556, "grad_norm": 0.022965986281633377, "kl": 0.019566059112548828, "learning_rate": 7.335995072666848e-06, "loss": -0.0162, "num_tokens": 140515804.0, "reward": 0.4194537103176117, "reward_std": 0.04903746023774147, "rewards/gemini_judge_reward_func/mean": 0.109375, "rewards/gemini_judge_reward_func/std": 0.20716224610805511, "rewards/semantic_correctness_reward_func/mean": 0.41276854276657104, "rewards/semantic_correctness_reward_func/std": 0.18567071855068207, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 161.39732360839844, "completions/mean_terminated_length": 141.70318603515625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1348640689684606, "grad_norm": 0.02179262414574623, "kl": 0.02160930633544922, "learning_rate": 7.3213631368078196e-06, "loss": -0.0011, "num_tokens": 140892281.0, "reward": 0.38270094990730286, "reward_std": 0.05515586584806442, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.26785045862197876, "rewards/semantic_correctness_reward_func/mean": 0.43906697630882263, "rewards/semantic_correctness_reward_func/std": 0.21814100444316864, "rewards/xmlcount_reward_func/mean": 0.6099866628646851, "rewards/xmlcount_reward_func/std": 0.4883228540420532, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 168.22769165039062, "completions/mean_terminated_length": 148.6894989013672, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.13520549699116555, "grad_norm": 0.021188482642173767, "kl": 0.012853145599365234, "learning_rate": 7.30670581489344e-06, "loss": -0.0209, "num_tokens": 141243504.0, "reward": 0.4657913148403168, "reward_std": 0.07301204651594162, "rewards/gemini_judge_reward_func/mean": 0.1551339328289032, "rewards/gemini_judge_reward_func/std": 0.3035410940647125, "rewards/semantic_correctness_reward_func/mean": 0.4099385440349579, "rewards/semantic_correctness_reward_func/std": 0.24554020166397095, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 149.60269165039062, "completions/mean_terminated_length": 145.68162536621094, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1355469250138705, "grad_norm": 0.021987369284033775, "kl": 0.018671512603759766, "learning_rate": 7.292023267213836e-06, "loss": -0.0344, "num_tokens": 141591435.0, "reward": 0.452779084444046, "reward_std": 0.06523489207029343, "rewards/gemini_judge_reward_func/mean": 0.1361607164144516, "rewards/gemini_judge_reward_func/std": 0.2388509213924408, "rewards/semantic_correctness_reward_func/mean": 0.45432382822036743, "rewards/semantic_correctness_reward_func/std": 0.19478566944599152, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 152.66519165039062, "completions/mean_terminated_length": 144.81532287597656, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.13588835303657548, "grad_norm": 0.02154485322535038, "kl": 0.01664876937866211, "learning_rate": 7.2773156543349965e-06, "loss": 0.0017, "num_tokens": 141945044.0, "reward": 0.4172542691230774, "reward_std": 0.07213146984577179, "rewards/gemini_judge_reward_func/mean": 0.1238839253783226, "rewards/gemini_judge_reward_func/std": 0.2615041136741638, "rewards/semantic_correctness_reward_func/mean": 0.4085032045841217, "rewards/semantic_correctness_reward_func/std": 0.20544031262397766, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 151.21429443359375, "completions/mean_terminated_length": 139.36651611328125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.13622978105928044, "grad_norm": 0.020291056483983994, "kl": 0.018874645233154297, "learning_rate": 7.262583137097019e-06, "loss": 0.0093, "num_tokens": 142311220.0, "reward": 0.44641584157943726, "reward_std": 0.0624687597155571, "rewards/gemini_judge_reward_func/mean": 0.140625, "rewards/gemini_judge_reward_func/std": 0.2599613070487976, "rewards/semantic_correctness_reward_func/mean": 0.4493291676044464, "rewards/semantic_correctness_reward_func/std": 0.2035626322031021, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0401785714285714, "completions/max_length": 1024.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 174.49554443359375, "completions/mean_terminated_length": 138.9348907470703, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.13657120908198542, "grad_norm": 0.02120777778327465, "kl": 0.021905899047851562, "learning_rate": 7.247825876612353e-06, "loss": -0.0367, "num_tokens": 142682655.0, "reward": 0.38896819949150085, "reward_std": 0.06701021641492844, "rewards/gemini_judge_reward_func/mean": 0.140625, "rewards/gemini_judge_reward_func/std": 0.257796049118042, "rewards/semantic_correctness_reward_func/mean": 0.45702823996543884, "rewards/semantic_correctness_reward_func/std": 0.19833700358867645, "rewards/xmlcount_reward_func/mean": 0.6032813191413879, "rewards/xmlcount_reward_func/std": 0.4909299612045288, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 171.41964721679688, "completions/mean_terminated_length": 159.84616088867188, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.13691263710469037, "grad_norm": 0.022050578147172928, "kl": 0.025957345962524414, "learning_rate": 7.233044034264034e-06, "loss": 0.0136, "num_tokens": 143029509.0, "reward": 0.4459562301635742, "reward_std": 0.06850647926330566, "rewards/gemini_judge_reward_func/mean": 0.1417410671710968, "rewards/gemini_judge_reward_func/std": 0.24202269315719604, "rewards/semantic_correctness_reward_func/mean": 0.44479867815971375, "rewards/semantic_correctness_reward_func/std": 0.20681588351726532, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 159.51339721679688, "completions/mean_terminated_length": 139.7762451171875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.13725406512739532, "grad_norm": 0.02276882901787758, "kl": 0.028842449188232422, "learning_rate": 7.218237771703921e-06, "loss": -0.0022, "num_tokens": 143394860.0, "reward": 0.38758131861686707, "reward_std": 0.05766326189041138, "rewards/gemini_judge_reward_func/mean": 0.1171875, "rewards/gemini_judge_reward_func/std": 0.21161696314811707, "rewards/semantic_correctness_reward_func/mean": 0.4165315628051758, "rewards/semantic_correctness_reward_func/std": 0.19921529293060303, "rewards/xmlcount_reward_func/mean": 0.643500030040741, "rewards/xmlcount_reward_func/std": 0.48071083426475525, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 155.125, "completions/mean_terminated_length": 147.29730224609375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1375954931501003, "grad_norm": 0.02148452028632164, "kl": 0.01662302017211914, "learning_rate": 7.203407250850929e-06, "loss": 0.0048, "num_tokens": 143717172.0, "reward": 0.4796033799648285, "reward_std": 0.08433418720960617, "rewards/gemini_judge_reward_func/mean": 0.1674107164144516, "rewards/gemini_judge_reward_func/std": 0.26340386271476746, "rewards/semantic_correctness_reward_func/mean": 0.4901953339576721, "rewards/semantic_correctness_reward_func/std": 0.21127957105636597, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 148.8303680419922, "completions/mean_terminated_length": 140.94595336914062, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.13793692117280526, "grad_norm": 0.020280931144952774, "kl": 0.016697406768798828, "learning_rate": 7.18855263388926e-06, "loss": 0.0017, "num_tokens": 144081082.0, "reward": 0.449037104845047, "reward_std": 0.06599867343902588, "rewards/gemini_judge_reward_func/mean": 0.1629464328289032, "rewards/gemini_judge_reward_func/std": 0.25325196981430054, "rewards/semantic_correctness_reward_func/mean": 0.4557745158672333, "rewards/semantic_correctness_reward_func/std": 0.19886869192123413, "rewards/xmlcount_reward_func/mean": 0.7317589521408081, "rewards/xmlcount_reward_func/std": 0.4439156949520111, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 165.49107360839844, "completions/mean_terminated_length": 145.89041137695312, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.13827834919551021, "grad_norm": 0.020092271268367767, "kl": 0.015337467193603516, "learning_rate": 7.173674083266624e-06, "loss": -0.024, "num_tokens": 144430580.0, "reward": 0.4049227833747864, "reward_std": 0.05651836097240448, "rewards/gemini_judge_reward_func/mean": 0.1138392835855484, "rewards/gemini_judge_reward_func/std": 0.2141004502773285, "rewards/semantic_correctness_reward_func/mean": 0.4339620769023895, "rewards/semantic_correctness_reward_func/std": 0.19319763779640198, "rewards/xmlcount_reward_func/mean": 0.6814866662025452, "rewards/xmlcount_reward_func/std": 0.46647319197654724, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 163.99107360839844, "completions/mean_terminated_length": 152.31674194335938, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.1386197772182152, "grad_norm": 0.021386247128248215, "kl": 0.019238710403442383, "learning_rate": 7.158771761692464e-06, "loss": -0.0153, "num_tokens": 144785990.0, "reward": 0.42315545678138733, "reward_std": 0.058707475662231445, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.25714486837387085, "rewards/semantic_correctness_reward_func/mean": 0.4447057247161865, "rewards/semantic_correctness_reward_func/std": 0.2132960706949234, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 158.23214721679688, "completions/mean_terminated_length": 146.47964477539062, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.13896120524092015, "grad_norm": 0.020573705434799194, "kl": 0.018062591552734375, "learning_rate": 7.143845832136188e-06, "loss": -0.0133, "num_tokens": 145152102.0, "reward": 0.4384137988090515, "reward_std": 0.07652968168258667, "rewards/gemini_judge_reward_func/mean": 0.1361607164144516, "rewards/gemini_judge_reward_func/std": 0.25474947690963745, "rewards/semantic_correctness_reward_func/mean": 0.46782782673835754, "rewards/semantic_correctness_reward_func/std": 0.21055324375629425, "rewards/xmlcount_reward_func/mean": 0.7259598970413208, "rewards/xmlcount_reward_func/std": 0.4518895447254181, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 179.15626525878906, "completions/mean_terminated_length": 155.90365600585938, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.1393026332636251, "grad_norm": 0.021146338433027267, "kl": 0.0170745849609375, "learning_rate": 7.128896457825364e-06, "loss": -0.0303, "num_tokens": 145496605.0, "reward": 0.4385666251182556, "reward_std": 0.054036956280469894, "rewards/gemini_judge_reward_func/mean": 0.1116071417927742, "rewards/gemini_judge_reward_func/std": 0.22669215500354767, "rewards/semantic_correctness_reward_func/mean": 0.43236854672431946, "rewards/semantic_correctness_reward_func/std": 0.21931228041648865, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 171.75894165039062, "completions/mean_terminated_length": 152.30136108398438, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.13964406128633008, "grad_norm": 0.020863203331828117, "kl": 0.017315387725830078, "learning_rate": 7.113923802243957e-06, "loss": -0.0304, "num_tokens": 145874323.0, "reward": 0.4249069094657898, "reward_std": 0.06283921003341675, "rewards/gemini_judge_reward_func/mean": 0.1529017835855484, "rewards/gemini_judge_reward_func/std": 0.276716947555542, "rewards/semantic_correctness_reward_func/mean": 0.46023085713386536, "rewards/semantic_correctness_reward_func/std": 0.20974647998809814, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 163.0357208251953, "completions/mean_terminated_length": 143.3789825439453, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.13998548930903504, "grad_norm": 0.01941607892513275, "kl": 0.01681995391845703, "learning_rate": 7.098928029130529e-06, "loss": 0.0088, "num_tokens": 146230799.0, "reward": 0.3921462297439575, "reward_std": 0.06177087500691414, "rewards/gemini_judge_reward_func/mean": 0.1127232164144516, "rewards/gemini_judge_reward_func/std": 0.23643691837787628, "rewards/semantic_correctness_reward_func/mean": 0.44157034158706665, "rewards/semantic_correctness_reward_func/std": 0.1995597928762436, "rewards/xmlcount_reward_func/mean": 0.6468572020530701, "rewards/xmlcount_reward_func/std": 0.47763964533805847, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 152.8616180419922, "completions/mean_terminated_length": 137.02272033691406, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.14032691733174, "grad_norm": 0.02037540264427662, "kl": 0.020148277282714844, "learning_rate": 7.083909302476453e-06, "loss": -0.0013, "num_tokens": 146610804.0, "reward": 0.3764539062976837, "reward_std": 0.07178690284490585, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.25606971979141235, "rewards/semantic_correctness_reward_func/mean": 0.41007286310195923, "rewards/semantic_correctness_reward_func/std": 0.2045731246471405, "rewards/xmlcount_reward_func/mean": 0.6077500581741333, "rewards/xmlcount_reward_func/std": 0.48996880650520325, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 149.24554443359375, "completions/mean_terminated_length": 141.3648681640625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.14066834535444497, "grad_norm": 0.02088170126080513, "kl": 0.015942096710205078, "learning_rate": 7.068867786524116e-06, "loss": -0.0028, "num_tokens": 146958603.0, "reward": 0.4193098247051239, "reward_std": 0.04512748494744301, "rewards/gemini_judge_reward_func/mean": 0.1037946417927742, "rewards/gemini_judge_reward_func/std": 0.22855830192565918, "rewards/semantic_correctness_reward_func/mean": 0.4232097566127777, "rewards/semantic_correctness_reward_func/std": 0.22072601318359375, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 153.6919708251953, "completions/mean_terminated_length": 149.78924560546875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.14100977337714993, "grad_norm": 0.020876459777355194, "kl": 0.020923137664794922, "learning_rate": 7.053803645765128e-06, "loss": 0.0084, "num_tokens": 147314350.0, "reward": 0.43098554015159607, "reward_std": 0.053628940135240555, "rewards/gemini_judge_reward_func/mean": 0.1618303507566452, "rewards/gemini_judge_reward_func/std": 0.2943499982357025, "rewards/semantic_correctness_reward_func/mean": 0.4861953556537628, "rewards/semantic_correctness_reward_func/std": 0.2150430679321289, "rewards/xmlcount_reward_func/mean": 0.67253577709198, "rewards/xmlcount_reward_func/std": 0.46746620535850525, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 156.25, "completions/mean_terminated_length": 148.43243408203125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.14135120139985488, "grad_norm": 0.022351212799549103, "kl": 0.01930093765258789, "learning_rate": 7.038717044938519e-06, "loss": -0.0056, "num_tokens": 147669662.0, "reward": 0.48152732849121094, "reward_std": 0.07369009405374527, "rewards/gemini_judge_reward_func/mean": 0.1830357164144516, "rewards/gemini_judge_reward_func/std": 0.3013090491294861, "rewards/semantic_correctness_reward_func/mean": 0.4685649275779724, "rewards/semantic_correctness_reward_func/std": 0.23053161799907684, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 161.6607208251953, "completions/mean_terminated_length": 141.97259521484375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.14169262942255986, "grad_norm": 0.023213328793644905, "kl": 0.02099323272705078, "learning_rate": 7.023608149028936e-06, "loss": -0.0126, "num_tokens": 148012226.0, "reward": 0.44395384192466736, "reward_std": 0.05156712979078293, "rewards/gemini_judge_reward_func/mean": 0.1607142835855484, "rewards/gemini_judge_reward_func/std": 0.28283706307411194, "rewards/semantic_correctness_reward_func/mean": 0.4683404564857483, "rewards/semantic_correctness_reward_func/std": 0.21465305984020233, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 149.7857208251953, "completions/mean_terminated_length": 141.909912109375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.14203405744526482, "grad_norm": 0.020580802112817764, "kl": 0.017251014709472656, "learning_rate": 7.008477123264849e-06, "loss": -0.0101, "num_tokens": 148353486.0, "reward": 0.480153888463974, "reward_std": 0.08174009621143341, "rewards/gemini_judge_reward_func/mean": 0.1796875, "rewards/gemini_judge_reward_func/std": 0.29536840319633484, "rewards/semantic_correctness_reward_func/mean": 0.5041443705558777, "rewards/semantic_correctness_reward_func/std": 0.2075480967760086, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 161.33929443359375, "completions/mean_terminated_length": 141.64382934570312, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.14237548546796977, "grad_norm": 0.02087719924747944, "kl": 0.020658493041992188, "learning_rate": 6.993324133116726e-06, "loss": -0.0102, "num_tokens": 148723782.0, "reward": 0.432353675365448, "reward_std": 0.05930045619606972, "rewards/gemini_judge_reward_func/mean": 0.1316964328289032, "rewards/gemini_judge_reward_func/std": 0.2667270004749298, "rewards/semantic_correctness_reward_func/mean": 0.4326254427433014, "rewards/semantic_correctness_reward_func/std": 0.2124425172805786, "rewards/xmlcount_reward_func/mean": 0.7328750491142273, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0401785714285714, "completions/max_length": 1024.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 184.37501525878906, "completions/mean_terminated_length": 149.2279052734375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.14271691349067475, "grad_norm": 0.02160128392279148, "kl": 0.01466989517211914, "learning_rate": 6.978149344295242e-06, "loss": 0.0137, "num_tokens": 149068342.0, "reward": 0.45865893363952637, "reward_std": 0.07876806706190109, "rewards/gemini_judge_reward_func/mean": 0.1473214328289032, "rewards/gemini_judge_reward_func/std": 0.2562088966369629, "rewards/semantic_correctness_reward_func/mean": 0.425651490688324, "rewards/semantic_correctness_reward_func/std": 0.21036946773529053, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 175.50894165039062, "completions/mean_terminated_length": 148.13824462890625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.1430583415133797, "grad_norm": 0.032436758279800415, "kl": 0.022993087768554688, "learning_rate": 6.9629529227494575e-06, "loss": 0.0047, "num_tokens": 149440944.0, "reward": 0.4210297167301178, "reward_std": 0.06524720042943954, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.2646643817424774, "rewards/semantic_correctness_reward_func/mean": 0.41618412733078003, "rewards/semantic_correctness_reward_func/std": 0.21945932507514954, "rewards/xmlcount_reward_func/mean": 0.7239465117454529, "rewards/xmlcount_reward_func/std": 0.4438221752643585, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 182.75001525878906, "completions/mean_terminated_length": 151.59259033203125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.1433997695360847, "grad_norm": 0.019419504329562187, "kl": 0.013905525207519531, "learning_rate": 6.9477350346650016e-06, "loss": -0.0283, "num_tokens": 149800156.0, "reward": 0.4564591348171234, "reward_std": 0.06576818972826004, "rewards/gemini_judge_reward_func/mean": 0.1863839328289032, "rewards/gemini_judge_reward_func/std": 0.273295521736145, "rewards/semantic_correctness_reward_func/mean": 0.4437777101993561, "rewards/semantic_correctness_reward_func/std": 0.23594844341278076, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 190.56251525878906, "completions/mean_terminated_length": 151.6168212890625, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.14374119755878964, "grad_norm": 0.022768495604395866, "kl": 0.02614879608154297, "learning_rate": 6.932495846462262e-06, "loss": 0.0009, "num_tokens": 150198454.0, "reward": 0.3712509572505951, "reward_std": 0.06766778230667114, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.22829538583755493, "rewards/semantic_correctness_reward_func/mean": 0.4421386122703552, "rewards/semantic_correctness_reward_func/std": 0.18618284165859222, "rewards/xmlcount_reward_func/mean": 0.5787099003791809, "rewards/xmlcount_reward_func/std": 0.49204617738723755, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 160.7366180419922, "completions/mean_terminated_length": 149.0181121826172, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1440826255814946, "grad_norm": 0.02022860199213028, "kl": 0.017331600189208984, "learning_rate": 6.9172355247945586e-06, "loss": -0.0178, "num_tokens": 150555467.0, "reward": 0.45713624358177185, "reward_std": 0.06998570263385773, "rewards/gemini_judge_reward_func/mean": 0.1529017835855484, "rewards/gemini_judge_reward_func/std": 0.2817355990409851, "rewards/semantic_correctness_reward_func/mean": 0.4426274299621582, "rewards/semantic_correctness_reward_func/std": 0.22578133642673492, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 170.2366180419922, "completions/mean_terminated_length": 158.64706420898438, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.14442405360419958, "grad_norm": 0.019036274403333664, "kl": 0.019074440002441406, "learning_rate": 6.901954236546324e-06, "loss": 0.0253, "num_tokens": 150930284.0, "reward": 0.4317302107810974, "reward_std": 0.0508720763027668, "rewards/gemini_judge_reward_func/mean": 0.1305803507566452, "rewards/gemini_judge_reward_func/std": 0.22453762590885162, "rewards/semantic_correctness_reward_func/mean": 0.4317401349544525, "rewards/semantic_correctness_reward_func/std": 0.19227589666843414, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 182.78126525878906, "completions/mean_terminated_length": 151.625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.14476548162690453, "grad_norm": 0.019461285322904587, "kl": 0.018743515014648438, "learning_rate": 6.88665214883128e-06, "loss": 0.0233, "num_tokens": 151300231.0, "reward": 0.41627252101898193, "reward_std": 0.07595758885145187, "rewards/gemini_judge_reward_func/mean": 0.1383928507566452, "rewards/gemini_judge_reward_func/std": 0.2579222023487091, "rewards/semantic_correctness_reward_func/mean": 0.4460768401622772, "rewards/semantic_correctness_reward_func/std": 0.19947165250778198, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 164.13839721679688, "completions/mean_terminated_length": 156.3918914794922, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.14510690964960948, "grad_norm": 0.01971801184117794, "kl": 0.01684427261352539, "learning_rate": 6.871329428990602e-06, "loss": -0.003, "num_tokens": 151681530.0, "reward": 0.4248278737068176, "reward_std": 0.06656418740749359, "rewards/gemini_judge_reward_func/mean": 0.1294642835855484, "rewards/gemini_judge_reward_func/std": 0.27094367146492004, "rewards/semantic_correctness_reward_func/mean": 0.41733548045158386, "rewards/semantic_correctness_reward_func/std": 0.21749421954154968, "rewards/xmlcount_reward_func/mean": 0.7239375710487366, "rewards/xmlcount_reward_func/std": 0.4488601088523865, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 167.9419708251953, "completions/mean_terminated_length": 156.32127380371094, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.14544833767231446, "grad_norm": 0.019281448796391487, "kl": 0.017251014709472656, "learning_rate": 6.855986244591104e-06, "loss": 0.0054, "num_tokens": 152040889.0, "reward": 0.45101961493492126, "reward_std": 0.07227209955453873, "rewards/gemini_judge_reward_func/mean": 0.1551339328289032, "rewards/gemini_judge_reward_func/std": 0.2805072069168091, "rewards/semantic_correctness_reward_func/mean": 0.46121397614479065, "rewards/semantic_correctness_reward_func/std": 0.2255532592535019, "rewards/xmlcount_reward_func/mean": 0.7418080568313599, "rewards/xmlcount_reward_func/std": 0.4394637644290924, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 164.71429443359375, "completions/mean_terminated_length": 153.04977416992188, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.14578976569501942, "grad_norm": 0.01867520622909069, "kl": 0.018479347229003906, "learning_rate": 6.840622763423391e-06, "loss": 0.0091, "num_tokens": 152378793.0, "reward": 0.4158693850040436, "reward_std": 0.05586162954568863, "rewards/gemini_judge_reward_func/mean": 0.1104910746216774, "rewards/gemini_judge_reward_func/std": 0.20993109047412872, "rewards/semantic_correctness_reward_func/mean": 0.4283645451068878, "rewards/semantic_correctness_reward_func/std": 0.20962867140769958, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 169.82589721679688, "completions/mean_terminated_length": 158.23077392578125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.14613119371772437, "grad_norm": 0.02032027207314968, "kl": 0.017673969268798828, "learning_rate": 6.825239153500029e-06, "loss": -0.0278, "num_tokens": 152725418.0, "reward": 0.4743519127368927, "reward_std": 0.0713319256901741, "rewards/gemini_judge_reward_func/mean": 0.1417410671710968, "rewards/gemini_judge_reward_func/std": 0.26838013529777527, "rewards/semantic_correctness_reward_func/mean": 0.4437771439552307, "rewards/semantic_correctness_reward_func/std": 0.21585533022880554, "rewards/xmlcount_reward_func/mean": 0.8222500085830688, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 164.98214721679688, "completions/mean_terminated_length": 157.2432403564453, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.14647262174042935, "grad_norm": 0.02102663740515709, "kl": 0.017747879028320312, "learning_rate": 6.809835583053716e-06, "loss": -0.0108, "num_tokens": 153084938.0, "reward": 0.46583712100982666, "reward_std": 0.0666920468211174, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.2437330037355423, "rewards/semantic_correctness_reward_func/mean": 0.4279892146587372, "rewards/semantic_correctness_reward_func/std": 0.207007497549057, "rewards/xmlcount_reward_func/mean": 0.8222500681877136, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 183.44644165039062, "completions/mean_terminated_length": 160.31192016601562, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1468140497631343, "grad_norm": 0.019378382712602615, "kl": 0.017717361450195312, "learning_rate": 6.794412220535426e-06, "loss": -0.0137, "num_tokens": 153469126.0, "reward": 0.4423011839389801, "reward_std": 0.067063108086586, "rewards/gemini_judge_reward_func/mean": 0.1674107164144516, "rewards/gemini_judge_reward_func/std": 0.2878098487854004, "rewards/semantic_correctness_reward_func/mean": 0.46568432450294495, "rewards/semantic_correctness_reward_func/std": 0.2046249806880951, "rewards/xmlcount_reward_func/mean": 0.705500066280365, "rewards/xmlcount_reward_func/std": 0.45444735884666443, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0401785714285714, "completions/max_length": 1024.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 191.10714721679688, "completions/mean_terminated_length": 156.2418670654297, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.14715547778583926, "grad_norm": 0.01916058361530304, "kl": 0.017314910888671875, "learning_rate": 6.778969234612583e-06, "loss": -0.0471, "num_tokens": 153827938.0, "reward": 0.4171583354473114, "reward_std": 0.05001484602689743, "rewards/gemini_judge_reward_func/mean": 0.1127232164144516, "rewards/gemini_judge_reward_func/std": 0.2601417303085327, "rewards/semantic_correctness_reward_func/mean": 0.4303452670574188, "rewards/semantic_correctness_reward_func/std": 0.21427829563617706, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 169.38394165039062, "completions/mean_terminated_length": 153.84544372558594, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.14749690580854424, "grad_norm": 0.02057456597685814, "kl": 0.016883373260498047, "learning_rate": 6.763506794167207e-06, "loss": -0.0115, "num_tokens": 154183800.0, "reward": 0.4381869435310364, "reward_std": 0.0673794075846672, "rewards/gemini_judge_reward_func/mean": 0.1138392835855484, "rewards/gemini_judge_reward_func/std": 0.22432857751846313, "rewards/semantic_correctness_reward_func/mean": 0.4260060787200928, "rewards/semantic_correctness_reward_func/std": 0.19526949524879456, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 201.54019165039062, "completions/mean_terminated_length": 163.1074676513672, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1478383338312492, "grad_norm": 0.019249025732278824, "kl": 0.019617557525634766, "learning_rate": 6.748025068294067e-06, "loss": -0.0242, "num_tokens": 154560981.0, "reward": 0.3731415867805481, "reward_std": 0.05926031246781349, "rewards/gemini_judge_reward_func/mean": 0.1082589253783226, "rewards/gemini_judge_reward_func/std": 0.22397971153259277, "rewards/semantic_correctness_reward_func/mean": 0.4336898624897003, "rewards/semantic_correctness_reward_func/std": 0.2114102691411972, "rewards/xmlcount_reward_func/mean": 0.6077500581741333, "rewards/xmlcount_reward_func/std": 0.48996880650520325, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 165.08929443359375, "completions/mean_terminated_length": 153.42987060546875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.14817976185395415, "grad_norm": 0.021220970898866653, "kl": 0.019231796264648438, "learning_rate": 6.732524226298841e-06, "loss": -0.0126, "num_tokens": 154907193.0, "reward": 0.45769333839416504, "reward_std": 0.060595184564590454, "rewards/gemini_judge_reward_func/mean": 0.1350446492433548, "rewards/gemini_judge_reward_func/std": 0.23772463202476501, "rewards/semantic_correctness_reward_func/mean": 0.44537705183029175, "rewards/semantic_correctness_reward_func/std": 0.20738016068935394, "rewards/xmlcount_reward_func/mean": 0.786500096321106, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 170.7678680419922, "completions/mean_terminated_length": 147.28439331054688, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.14852118987665913, "grad_norm": 0.020631877705454826, "kl": 0.018312454223632812, "learning_rate": 6.717004437696249e-06, "loss": -0.0163, "num_tokens": 155285817.0, "reward": 0.4152604341506958, "reward_std": 0.06980551779270172, "rewards/gemini_judge_reward_func/mean": 0.1662946492433548, "rewards/gemini_judge_reward_func/std": 0.30590617656707764, "rewards/semantic_correctness_reward_func/mean": 0.4470253586769104, "rewards/semantic_correctness_reward_func/std": 0.22271786630153656, "rewards/xmlcount_reward_func/mean": 0.6483437418937683, "rewards/xmlcount_reward_func/std": 0.4749422073364258, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 169.83482360839844, "completions/mean_terminated_length": 146.32568359375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.14886261789936409, "grad_norm": 0.019136464223265648, "kl": 0.020123004913330078, "learning_rate": 6.701465872208216e-06, "loss": 0.0038, "num_tokens": 155641420.0, "reward": 0.4250961244106293, "reward_std": 0.06240704655647278, "rewards/gemini_judge_reward_func/mean": 0.1138392835855484, "rewards/gemini_judge_reward_func/std": 0.2141004502773285, "rewards/semantic_correctness_reward_func/mean": 0.4320519268512726, "rewards/semantic_correctness_reward_func/std": 0.1874925047159195, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0535714285714286, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 195.6741180419922, "completions/mean_terminated_length": 148.78773498535156, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.14920404592206907, "grad_norm": 0.020079955458641052, "kl": 0.019093990325927734, "learning_rate": 6.685908699762003e-06, "loss": 0.0153, "num_tokens": 156023519.0, "reward": 0.407644659280777, "reward_std": 0.07643434405326843, "rewards/gemini_judge_reward_func/mean": 0.1517857164144516, "rewards/gemini_judge_reward_func/std": 0.2706849277019501, "rewards/semantic_correctness_reward_func/mean": 0.4476517140865326, "rewards/semantic_correctness_reward_func/std": 0.20519518852233887, "rewards/xmlcount_reward_func/mean": 0.643500030040741, "rewards/xmlcount_reward_func/std": 0.48071080446243286, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 186.77232360839844, "completions/mean_terminated_length": 155.76388549804688, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.14954547394477402, "grad_norm": 0.019271496683359146, "kl": 0.01635265350341797, "learning_rate": 6.670333090488357e-06, "loss": -0.0304, "num_tokens": 156369764.0, "reward": 0.423153817653656, "reward_std": 0.05922694131731987, "rewards/gemini_judge_reward_func/mean": 0.09375, "rewards/gemini_judge_reward_func/std": 0.18679800629615784, "rewards/semantic_correctness_reward_func/mean": 0.4267689883708954, "rewards/semantic_correctness_reward_func/std": 0.19564877450466156, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 177.63394165039062, "completions/mean_terminated_length": 170.00901794433594, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.14988690196747897, "grad_norm": 0.019324006512761116, "kl": 0.016142845153808594, "learning_rate": 6.654739214719642e-06, "loss": -0.0193, "num_tokens": 156725898.0, "reward": 0.4374414384365082, "reward_std": 0.06267654150724411, "rewards/gemini_judge_reward_func/mean": 0.0948660746216774, "rewards/gemini_judge_reward_func/std": 0.19719554483890533, "rewards/semantic_correctness_reward_func/mean": 0.4211089313030243, "rewards/semantic_correctness_reward_func/std": 0.20056340098381042, "rewards/xmlcount_reward_func/mean": 0.7881830930709839, "rewards/xmlcount_reward_func/std": 0.40579503774642944, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 168.65625, "completions/mean_terminated_length": 149.1278533935547, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.15022832999018396, "grad_norm": 0.02156071364879608, "kl": 0.0205535888671875, "learning_rate": 6.6391272429879886e-06, "loss": -0.005, "num_tokens": 157090797.0, "reward": 0.41712579131126404, "reward_std": 0.07244788855314255, "rewards/gemini_judge_reward_func/mean": 0.1171875, "rewards/gemini_judge_reward_func/std": 0.2515541613101959, "rewards/semantic_correctness_reward_func/mean": 0.40336090326309204, "rewards/semantic_correctness_reward_func/std": 0.21470442414283752, "rewards/xmlcount_reward_func/mean": 0.7239464521408081, "rewards/xmlcount_reward_func/std": 0.44382214546203613, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 162.4553680419922, "completions/mean_terminated_length": 154.69369506835938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1505697580128889, "grad_norm": 0.020700950175523758, "kl": 0.01773834228515625, "learning_rate": 6.6234973460234184e-06, "loss": -0.0183, "num_tokens": 157448539.0, "reward": 0.4645825922489166, "reward_std": 0.08551337569952011, "rewards/gemini_judge_reward_func/mean": 0.1584821492433548, "rewards/gemini_judge_reward_func/std": 0.28606563806533813, "rewards/semantic_correctness_reward_func/mean": 0.4790647327899933, "rewards/semantic_correctness_reward_func/std": 0.23252920806407928, "rewards/xmlcount_reward_func/mean": 0.7634419798851013, "rewards/xmlcount_reward_func/std": 0.4243088662624359, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 169.44644165039062, "completions/mean_terminated_length": 149.93606567382812, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.15091118603559386, "grad_norm": 0.02045726776123047, "kl": 0.016900062561035156, "learning_rate": 6.607849694751978e-06, "loss": -0.0017, "num_tokens": 157800091.0, "reward": 0.4241785407066345, "reward_std": 0.06672972440719604, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.25163623690605164, "rewards/semantic_correctness_reward_func/mean": 0.45698192715644836, "rewards/semantic_correctness_reward_func/std": 0.2180858850479126, "rewards/xmlcount_reward_func/mean": 0.7114196419715881, "rewards/xmlcount_reward_func/std": 0.45252570509910583, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 184.04019165039062, "completions/mean_terminated_length": 160.92201232910156, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.15125261405829885, "grad_norm": 0.020864713937044144, "kl": 0.020277023315429688, "learning_rate": 6.592184460293878e-06, "loss": -0.0066, "num_tokens": 158173060.0, "reward": 0.3883172571659088, "reward_std": 0.051999613642692566, "rewards/gemini_judge_reward_func/mean": 0.0825892835855484, "rewards/gemini_judge_reward_func/std": 0.20335854589939117, "rewards/semantic_correctness_reward_func/mean": 0.3843896985054016, "rewards/semantic_correctness_reward_func/std": 0.20913942158222198, "rewards/xmlcount_reward_func/mean": 0.6960089802742004, "rewards/xmlcount_reward_func/std": 0.46085411310195923, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 176.4553680419922, "completions/mean_terminated_length": 157.10501098632812, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.1515940420810038, "grad_norm": 0.021549487486481667, "kl": 0.017778396606445312, "learning_rate": 6.576501813961609e-06, "loss": 0.0159, "num_tokens": 158517066.0, "reward": 0.44117918610572815, "reward_std": 0.07043396681547165, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.22706440091133118, "rewards/semantic_correctness_reward_func/mean": 0.4476993978023529, "rewards/semantic_correctness_reward_func/std": 0.2108486443758011, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 172.6607208251953, "completions/mean_terminated_length": 153.22373962402344, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.15193547010370875, "grad_norm": 0.019931474700570107, "kl": 0.01573467254638672, "learning_rate": 6.560801927258081e-06, "loss": -0.0191, "num_tokens": 158853982.0, "reward": 0.4490154981613159, "reward_std": 0.06086720898747444, "rewards/gemini_judge_reward_func/mean": 0.0825892835855484, "rewards/gemini_judge_reward_func/std": 0.1977689266204834, "rewards/semantic_correctness_reward_func/mean": 0.41752371191978455, "rewards/semantic_correctness_reward_func/std": 0.18293553590774536, "rewards/xmlcount_reward_func/mean": 0.8311875462532043, "rewards/xmlcount_reward_func/std": 0.3765355050563812, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 159.55804443359375, "completions/mean_terminated_length": 155.68162536621094, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.15227689812641373, "grad_norm": 0.021299151703715324, "kl": 0.017287254333496094, "learning_rate": 6.545084971874738e-06, "loss": 0.0017, "num_tokens": 159219791.0, "reward": 0.431675523519516, "reward_std": 0.06107047200202942, "rewards/gemini_judge_reward_func/mean": 0.1082589253783226, "rewards/gemini_judge_reward_func/std": 0.2124195545911789, "rewards/semantic_correctness_reward_func/mean": 0.44035953283309937, "rewards/semantic_correctness_reward_func/std": 0.21529170870780945, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 185.9241180419922, "completions/mean_terminated_length": 154.88426208496094, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.1526183261491187, "grad_norm": 0.019747041165828705, "kl": 0.015558242797851562, "learning_rate": 6.529351119689687e-06, "loss": -0.0098, "num_tokens": 159573474.0, "reward": 0.4196682274341583, "reward_std": 0.059854909777641296, "rewards/gemini_judge_reward_func/mean": 0.1116071417927742, "rewards/gemini_judge_reward_func/std": 0.21397185325622559, "rewards/semantic_correctness_reward_func/mean": 0.4272516667842865, "rewards/semantic_correctness_reward_func/std": 0.20560157299041748, "rewards/xmlcount_reward_func/mean": 0.7239375710487366, "rewards/xmlcount_reward_func/std": 0.4488601088523865, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 165.14732360839844, "completions/mean_terminated_length": 149.5318145751953, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.15295975417182364, "grad_norm": 0.021277599036693573, "kl": 0.02167987823486328, "learning_rate": 6.513600542765816e-06, "loss": -0.0232, "num_tokens": 159928331.0, "reward": 0.4534654915332794, "reward_std": 0.07899215072393417, "rewards/gemini_judge_reward_func/mean": 0.1584821492433548, "rewards/gemini_judge_reward_func/std": 0.2688947916030884, "rewards/semantic_correctness_reward_func/mean": 0.45780062675476074, "rewards/semantic_correctness_reward_func/std": 0.2195635586977005, "rewards/xmlcount_reward_func/mean": 0.7462812662124634, "rewards/xmlcount_reward_func/std": 0.4369716942310333, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 147.91519165039062, "completions/mean_terminated_length": 140.02252197265625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.15330118219452862, "grad_norm": 0.02262871339917183, "kl": 0.02004528045654297, "learning_rate": 6.49783341334891e-06, "loss": -0.0262, "num_tokens": 160286236.0, "reward": 0.41993653774261475, "reward_std": 0.07568960636854172, "rewards/gemini_judge_reward_func/mean": 0.1361607164144516, "rewards/gemini_judge_reward_func/std": 0.22804586589336395, "rewards/semantic_correctness_reward_func/mean": 0.4688611328601837, "rewards/semantic_correctness_reward_func/std": 0.19843356311321259, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 179.52679443359375, "completions/mean_terminated_length": 148.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.15364261021723358, "grad_norm": 0.019460123032331467, "kl": 0.021003246307373047, "learning_rate": 6.4820499038657695e-06, "loss": 0.0156, "num_tokens": 160678002.0, "reward": 0.3975132405757904, "reward_std": 0.060600072145462036, "rewards/gemini_judge_reward_func/mean": 0.1439732164144516, "rewards/gemini_judge_reward_func/std": 0.27340537309646606, "rewards/semantic_correctness_reward_func/mean": 0.4305214285850525, "rewards/semantic_correctness_reward_func/std": 0.2286413013935089, "rewards/xmlcount_reward_func/mean": 0.6345491409301758, "rewards/xmlcount_reward_func/std": 0.48328086733818054, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 165.75894165039062, "completions/mean_terminated_length": 158.0270233154297, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.15398403823993853, "grad_norm": 0.020578309893608093, "kl": 0.017749786376953125, "learning_rate": 6.466250186922325e-06, "loss": -0.0127, "num_tokens": 161026600.0, "reward": 0.4278218448162079, "reward_std": 0.055558666586875916, "rewards/gemini_judge_reward_func/mean": 0.0915178582072258, "rewards/gemini_judge_reward_func/std": 0.1952219009399414, "rewards/semantic_correctness_reward_func/mean": 0.3830733299255371, "rewards/semantic_correctness_reward_func/std": 0.19349335134029388, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 170.3125, "completions/mean_terminated_length": 154.79090881347656, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.1543254662626435, "grad_norm": 0.01939302682876587, "kl": 0.016997814178466797, "learning_rate": 6.450434435301751e-06, "loss": -0.0112, "num_tokens": 161412190.0, "reward": 0.4270239770412445, "reward_std": 0.0703679621219635, "rewards/gemini_judge_reward_func/mean": 0.1104910746216774, "rewards/gemini_judge_reward_func/std": 0.21520504355430603, "rewards/semantic_correctness_reward_func/mean": 0.45732492208480835, "rewards/semantic_correctness_reward_func/std": 0.20194603502750397, "rewards/xmlcount_reward_func/mean": 0.7284063100814819, "rewards/xmlcount_reward_func/std": 0.4465976059436798, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 164.73214721679688, "completions/mean_terminated_length": 153.06788635253906, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.15466689428534847, "grad_norm": 0.02077455259859562, "kl": 0.015947818756103516, "learning_rate": 6.434602821962571e-06, "loss": -0.0275, "num_tokens": 161741334.0, "reward": 0.47266238927841187, "reward_std": 0.07710576057434082, "rewards/gemini_judge_reward_func/mean": 0.1462053507566452, "rewards/gemini_judge_reward_func/std": 0.2793271541595459, "rewards/semantic_correctness_reward_func/mean": 0.4264009892940521, "rewards/semantic_correctness_reward_func/std": 0.2282664179801941, "rewards/xmlcount_reward_func/mean": 0.8222500681877136, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 171.4732208251953, "completions/mean_terminated_length": 152.00912475585938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.15500832230805342, "grad_norm": 0.019553987309336662, "kl": 0.018682479858398438, "learning_rate": 6.418755520036775e-06, "loss": -0.0309, "num_tokens": 162107304.0, "reward": 0.4157797396183014, "reward_std": 0.054216425865888596, "rewards/gemini_judge_reward_func/mean": 0.0814732164144516, "rewards/gemini_judge_reward_func/std": 0.17334242165088654, "rewards/semantic_correctness_reward_func/mean": 0.45020225644111633, "rewards/semantic_correctness_reward_func/std": 0.2048385888338089, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 167.2991180419922, "completions/mean_terminated_length": 147.73971557617188, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.1553497503307584, "grad_norm": 0.021014470607042313, "kl": 0.019474029541015625, "learning_rate": 6.402892702827916e-06, "loss": 0.015, "num_tokens": 162469799.0, "reward": 0.4399973154067993, "reward_std": 0.08108548820018768, "rewards/gemini_judge_reward_func/mean": 0.1584821492433548, "rewards/gemini_judge_reward_func/std": 0.2688947916030884, "rewards/semantic_correctness_reward_func/mean": 0.4530222713947296, "rewards/semantic_correctness_reward_func/std": 0.19590409100055695, "rewards/xmlcount_reward_func/mean": 0.7150000929832458, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 163.92857360839844, "completions/mean_terminated_length": 152.25340270996094, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.15569117835346336, "grad_norm": 0.021115519106388092, "kl": 0.02216339111328125, "learning_rate": 6.387014543809224e-06, "loss": 0.0144, "num_tokens": 162832799.0, "reward": 0.43464815616607666, "reward_std": 0.07387977093458176, "rewards/gemini_judge_reward_func/mean": 0.1339285671710968, "rewards/gemini_judge_reward_func/std": 0.24815140664577484, "rewards/semantic_correctness_reward_func/mean": 0.41505321860313416, "rewards/semantic_correctness_reward_func/std": 0.195115327835083, "rewards/xmlcount_reward_func/mean": 0.745165228843689, "rewards/xmlcount_reward_func/std": 0.43663734197616577, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 158.75894165039062, "completions/mean_terminated_length": 154.87893676757812, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.15603260637616834, "grad_norm": 0.01938485912978649, "kl": 0.01820230484008789, "learning_rate": 6.371121216621698e-06, "loss": 0.0089, "num_tokens": 163185189.0, "reward": 0.4345279633998871, "reward_std": 0.056550104171037674, "rewards/gemini_judge_reward_func/mean": 0.1428571492433548, "rewards/gemini_judge_reward_func/std": 0.2288455367088318, "rewards/semantic_correctness_reward_func/mean": 0.4211753010749817, "rewards/semantic_correctness_reward_func/std": 0.16808317601680756, "rewards/xmlcount_reward_func/mean": 0.7328750491142273, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 161.32589721679688, "completions/mean_terminated_length": 149.61538696289062, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.1563740343988733, "grad_norm": 0.02125578746199608, "kl": 0.01828622817993164, "learning_rate": 6.355212895072223e-06, "loss": 0.003, "num_tokens": 163542542.0, "reward": 0.44018012285232544, "reward_std": 0.05906569957733154, "rewards/gemini_judge_reward_func/mean": 0.1305803507566452, "rewards/gemini_judge_reward_func/std": 0.24711813032627106, "rewards/semantic_correctness_reward_func/mean": 0.42258793115615845, "rewards/semantic_correctness_reward_func/std": 0.20304962992668152, "rewards/xmlcount_reward_func/mean": 0.7585759162902832, "rewards/xmlcount_reward_func/std": 0.4274976849555969, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 163.1607208251953, "completions/mean_terminated_length": 155.40541076660156, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.15671546242157824, "grad_norm": 0.021676773205399513, "kl": 0.0220947265625, "learning_rate": 6.339289753131649e-06, "loss": -0.0109, "num_tokens": 163883274.0, "reward": 0.4686073660850525, "reward_std": 0.06618095934391022, "rewards/gemini_judge_reward_func/mean": 0.1361607164144516, "rewards/gemini_judge_reward_func/std": 0.2457905411720276, "rewards/semantic_correctness_reward_func/mean": 0.4619653820991516, "rewards/semantic_correctness_reward_func/std": 0.18577620387077332, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0580357142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 212.57589721679688, "completions/mean_terminated_length": 162.58294677734375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.15705689044428323, "grad_norm": 0.017990613356232643, "kl": 0.01677417755126953, "learning_rate": 6.323351964932909e-06, "loss": -0.009, "num_tokens": 164261219.0, "reward": 0.426357239484787, "reward_std": 0.05332663282752037, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.2295006811618805, "rewards/semantic_correctness_reward_func/mean": 0.4741254448890686, "rewards/semantic_correctness_reward_func/std": 0.20388510823249817, "rewards/xmlcount_reward_func/mean": 0.708294689655304, "rewards/xmlcount_reward_func/std": 0.4551132023334503, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 176.0491180419922, "completions/mean_terminated_length": 152.71099853515625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.15739831846698818, "grad_norm": 0.02143760584294796, "kl": 0.021224498748779297, "learning_rate": 6.3073997047691e-06, "loss": -0.0043, "num_tokens": 164629886.0, "reward": 0.44384151697158813, "reward_std": 0.05540228635072708, "rewards/gemini_judge_reward_func/mean": 0.1551339328289032, "rewards/gemini_judge_reward_func/std": 0.2661517262458801, "rewards/semantic_correctness_reward_func/mean": 0.47893956303596497, "rewards/semantic_correctness_reward_func/std": 0.21486115455627441, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 161.59375, "completions/mean_terminated_length": 157.72647094726562, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.15773974648969313, "grad_norm": 0.020959317684173584, "kl": 0.020437240600585938, "learning_rate": 6.291433147091583e-06, "loss": -0.0102, "num_tokens": 164976791.0, "reward": 0.421293705701828, "reward_std": 0.05760593339800835, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.2414351999759674, "rewards/semantic_correctness_reward_func/mean": 0.42200401425361633, "rewards/semantic_correctness_reward_func/std": 0.17594051361083984, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 179.82144165039062, "completions/mean_terminated_length": 156.58714294433594, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.15808117451239811, "grad_norm": 0.018165314570069313, "kl": 0.02169036865234375, "learning_rate": 6.275452466508076e-06, "loss": -0.0342, "num_tokens": 165367455.0, "reward": 0.36975857615470886, "reward_std": 0.05713449418544769, "rewards/gemini_judge_reward_func/mean": 0.0870535746216774, "rewards/gemini_judge_reward_func/std": 0.20831865072250366, "rewards/semantic_correctness_reward_func/mean": 0.42343568801879883, "rewards/semantic_correctness_reward_func/std": 0.18575748801231384, "rewards/xmlcount_reward_func/mean": 0.6256250739097595, "rewards/xmlcount_reward_func/std": 0.48569241166114807, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 189.3482208251953, "completions/mean_terminated_length": 158.4351806640625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.15842260253510307, "grad_norm": 0.018870696425437927, "kl": 0.016434192657470703, "learning_rate": 6.259457837780741e-06, "loss": -0.0225, "num_tokens": 165704941.0, "reward": 0.46567752957344055, "reward_std": 0.07933323830366135, "rewards/gemini_judge_reward_func/mean": 0.1350446492433548, "rewards/gemini_judge_reward_func/std": 0.2558937668800354, "rewards/semantic_correctness_reward_func/mean": 0.44109275937080383, "rewards/semantic_correctness_reward_func/std": 0.2011324018239975, "rewards/xmlcount_reward_func/mean": 0.8086027503013611, "rewards/xmlcount_reward_func/std": 0.3939329981803894, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 171.37054443359375, "completions/mean_terminated_length": 151.9040985107422, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.15876403055780802, "grad_norm": 0.01999264396727085, "kl": 0.016565322875976562, "learning_rate": 6.243449435824276e-06, "loss": -0.026, "num_tokens": 166053888.0, "reward": 0.42705729603767395, "reward_std": 0.059816788882017136, "rewards/gemini_judge_reward_func/mean": 0.0881696417927742, "rewards/gemini_judge_reward_func/std": 0.17328467965126038, "rewards/semantic_correctness_reward_func/mean": 0.42169705033302307, "rewards/semantic_correctness_reward_func/std": 0.191665381193161, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 153.91519165039062, "completions/mean_terminated_length": 146.0765838623047, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.159105458580513, "grad_norm": 0.019290044903755188, "kl": 0.019309520721435547, "learning_rate": 6.227427435703997e-06, "loss": -0.0179, "num_tokens": 166413285.0, "reward": 0.40731704235076904, "reward_std": 0.05208640545606613, "rewards/gemini_judge_reward_func/mean": 0.1071428582072258, "rewards/gemini_judge_reward_func/std": 0.2136441320180893, "rewards/semantic_correctness_reward_func/mean": 0.4280492961406708, "rewards/semantic_correctness_reward_func/std": 0.20282071828842163, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 179.08929443359375, "completions/mean_terminated_length": 155.83485412597656, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.15944688660321796, "grad_norm": 0.01988377422094345, "kl": 0.016337871551513672, "learning_rate": 6.211392012633932e-06, "loss": -0.0146, "num_tokens": 166782753.0, "reward": 0.451289564371109, "reward_std": 0.06578972935676575, "rewards/gemini_judge_reward_func/mean": 0.1194196417927742, "rewards/gemini_judge_reward_func/std": 0.2331113964319229, "rewards/semantic_correctness_reward_func/mean": 0.46342092752456665, "rewards/semantic_correctness_reward_func/std": 0.2133999615907669, "rewards/xmlcount_reward_func/mean": 0.777093768119812, "rewards/xmlcount_reward_func/std": 0.4154271185398102, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 155.96429443359375, "completions/mean_terminated_length": 140.1818084716797, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1597883146259229, "grad_norm": 0.020306117832660675, "kl": 0.022162914276123047, "learning_rate": 6.1953433419748995e-06, "loss": -0.0045, "num_tokens": 167132245.0, "reward": 0.364609032869339, "reward_std": 0.06053777411580086, "rewards/gemini_judge_reward_func/mean": 0.0904017835855484, "rewards/gemini_judge_reward_func/std": 0.19502632319927216, "rewards/semantic_correctness_reward_func/mean": 0.39322349429130554, "rewards/semantic_correctness_reward_func/std": 0.21105892956256866, "rewards/xmlcount_reward_func/mean": 0.6245089769363403, "rewards/xmlcount_reward_func/std": 0.48511284589767456, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 185.08482360839844, "completions/mean_terminated_length": 158.02304077148438, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1601297426486279, "grad_norm": 0.046392910182476044, "kl": 0.032955169677734375, "learning_rate": 6.179281599232592e-06, "loss": -0.0111, "num_tokens": 167492956.0, "reward": 0.40365663170814514, "reward_std": 0.05007166042923927, "rewards/gemini_judge_reward_func/mean": 0.1216517835855484, "rewards/gemini_judge_reward_func/std": 0.23908916115760803, "rewards/semantic_correctness_reward_func/mean": 0.4164794981479645, "rewards/semantic_correctness_reward_func/std": 0.20123189687728882, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 161.73214721679688, "completions/mean_terminated_length": 157.865478515625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.16047117067133285, "grad_norm": 0.020119668915867805, "kl": 0.015314102172851562, "learning_rate": 6.163206960055652e-06, "loss": -0.0156, "num_tokens": 167859308.0, "reward": 0.46480125188827515, "reward_std": 0.05998440086841583, "rewards/gemini_judge_reward_func/mean": 0.0948660746216774, "rewards/gemini_judge_reward_func/std": 0.17457951605319977, "rewards/semantic_correctness_reward_func/mean": 0.4182741343975067, "rewards/semantic_correctness_reward_func/std": 0.21178297698497772, "rewards/xmlcount_reward_func/mean": 0.8580000996589661, "rewards/xmlcount_reward_func/std": 0.35106155276298523, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 182.29464721679688, "completions/mean_terminated_length": 159.12843322753906, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1608125986940378, "grad_norm": 0.01942136511206627, "kl": 0.018321990966796875, "learning_rate": 6.147119600233758e-06, "loss": -0.0183, "num_tokens": 168216734.0, "reward": 0.44124796986579895, "reward_std": 0.06361334770917892, "rewards/gemini_judge_reward_func/mean": 0.1328125, "rewards/gemini_judge_reward_func/std": 0.26031482219696045, "rewards/semantic_correctness_reward_func/mean": 0.43911466002464294, "rewards/semantic_correctness_reward_func/std": 0.2211882770061493, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 182.95089721679688, "completions/mean_terminated_length": 151.80093383789062, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.16115402671674278, "grad_norm": 0.02266276441514492, "kl": 0.01579761505126953, "learning_rate": 6.131019695695702e-06, "loss": -0.0148, "num_tokens": 168588127.0, "reward": 0.4200635850429535, "reward_std": 0.06345196068286896, "rewards/gemini_judge_reward_func/mean": 0.1194196417927742, "rewards/gemini_judge_reward_func/std": 0.22453764081001282, "rewards/semantic_correctness_reward_func/mean": 0.4404517710208893, "rewards/semantic_correctness_reward_func/std": 0.19075731933116913, "rewards/xmlcount_reward_func/mean": 0.7105134129524231, "rewards/xmlcount_reward_func/std": 0.4553045928478241, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 163.4241180419922, "completions/mean_terminated_length": 147.7772674560547, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.16149545473944774, "grad_norm": 0.020487939938902855, "kl": 0.0145416259765625, "learning_rate": 6.114907422507459e-06, "loss": -0.03, "num_tokens": 168923986.0, "reward": 0.48719358444213867, "reward_std": 0.07441079616546631, "rewards/gemini_judge_reward_func/mean": 0.171875, "rewards/gemini_judge_reward_func/std": 0.27919498085975647, "rewards/semantic_correctness_reward_func/mean": 0.44771772623062134, "rewards/semantic_correctness_reward_func/std": 0.22752991318702698, "rewards/xmlcount_reward_func/mean": 0.8222500681877136, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 185.5044708251953, "completions/mean_terminated_length": 158.4562225341797, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.16183688276215272, "grad_norm": 0.019220322370529175, "kl": 0.018144607543945312, "learning_rate": 6.098782956870266e-06, "loss": -0.0505, "num_tokens": 169316687.0, "reward": 0.383688360452652, "reward_std": 0.06443572044372559, "rewards/gemini_judge_reward_func/mean": 0.1183035746216774, "rewards/gemini_judge_reward_func/std": 0.23187629878520966, "rewards/semantic_correctness_reward_func/mean": 0.43058452010154724, "rewards/semantic_correctness_reward_func/std": 0.19228224456310272, "rewards/xmlcount_reward_func/mean": 0.6256250739097595, "rewards/xmlcount_reward_func/std": 0.48569241166114807, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 159.50894165039062, "completions/mean_terminated_length": 151.72071838378906, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.16217831078485767, "grad_norm": 0.020380543544888496, "kl": 0.01611471176147461, "learning_rate": 6.0826464751187e-06, "loss": -0.0099, "num_tokens": 169640537.0, "reward": 0.47983115911483765, "reward_std": 0.0633583590388298, "rewards/gemini_judge_reward_func/mean": 0.1350446492433548, "rewards/gemini_judge_reward_func/std": 0.2317548543214798, "rewards/semantic_correctness_reward_func/mean": 0.4488253891468048, "rewards/semantic_correctness_reward_func/std": 0.2299881875514984, "rewards/xmlcount_reward_func/mean": 0.8401206135749817, "rewards/xmlcount_reward_func/std": 0.3684559166431427, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 186.27679443359375, "completions/mean_terminated_length": 163.2201690673828, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.16251973880756262, "grad_norm": 0.01900290884077549, "kl": 0.01510167121887207, "learning_rate": 6.066498153718735e-06, "loss": 0.0154, "num_tokens": 170012887.0, "reward": 0.4446248412132263, "reward_std": 0.051946092396974564, "rewards/gemini_judge_reward_func/mean": 0.1216517835855484, "rewards/gemini_judge_reward_func/std": 0.23435330390930176, "rewards/semantic_correctness_reward_func/mean": 0.44257062673568726, "rewards/semantic_correctness_reward_func/std": 0.22105993330478668, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 152.46875, "completions/mean_terminated_length": 148.560546875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.1628611668302676, "grad_norm": 0.01916034147143364, "kl": 0.017156600952148438, "learning_rate": 6.0503381692658305e-06, "loss": -0.007, "num_tokens": 170356148.0, "reward": 0.4668045938014984, "reward_std": 0.06646783649921417, "rewards/gemini_judge_reward_func/mean": 0.1339285671710968, "rewards/gemini_judge_reward_func/std": 0.2537356913089752, "rewards/semantic_correctness_reward_func/mean": 0.4395405948162079, "rewards/semantic_correctness_reward_func/std": 0.2333323359489441, "rewards/xmlcount_reward_func/mean": 0.8133125305175781, "rewards/xmlcount_reward_func/std": 0.39157772064208984, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 167.65625, "completions/mean_terminated_length": 152.08636474609375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.16320259485297256, "grad_norm": 0.020782776176929474, "kl": 0.01477193832397461, "learning_rate": 6.034166698482984e-06, "loss": 0.0215, "num_tokens": 170715555.0, "reward": 0.4693361222743988, "reward_std": 0.07258699834346771, "rewards/gemini_judge_reward_func/mean": 0.1529017835855484, "rewards/gemini_judge_reward_func/std": 0.27057167887687683, "rewards/semantic_correctness_reward_func/mean": 0.4678769111633301, "rewards/semantic_correctness_reward_func/std": 0.22107519209384918, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 149.625, "completions/mean_terminated_length": 149.625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1635440228756775, "grad_norm": 0.020050855353474617, "kl": 0.019238948822021484, "learning_rate": 6.0179839182188125e-06, "loss": 0.0007, "num_tokens": 171058499.0, "reward": 0.4546305239200592, "reward_std": 0.05301572382450104, "rewards/gemini_judge_reward_func/mean": 0.1462053507566452, "rewards/gemini_judge_reward_func/std": 0.24282783269882202, "rewards/semantic_correctness_reward_func/mean": 0.44431307911872864, "rewards/semantic_correctness_reward_func/std": 0.20877443253993988, "rewards/xmlcount_reward_func/mean": 0.7682143449783325, "rewards/xmlcount_reward_func/std": 0.4233846962451935, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 163.11607360839844, "completions/mean_terminated_length": 155.36036682128906, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1638854508983825, "grad_norm": 0.01931094005703926, "kl": 0.01724720001220703, "learning_rate": 6.001790005445607e-06, "loss": -0.0094, "num_tokens": 171442269.0, "reward": 0.42174994945526123, "reward_std": 0.07509782910346985, "rewards/gemini_judge_reward_func/mean": 0.140625, "rewards/gemini_judge_reward_func/std": 0.2523016333580017, "rewards/semantic_correctness_reward_func/mean": 0.464526504278183, "rewards/semantic_correctness_reward_func/std": 0.20688366889953613, "rewards/xmlcount_reward_func/mean": 0.6814866662025452, "rewards/xmlcount_reward_func/std": 0.46647319197654724, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 158.91964721679688, "completions/mean_terminated_length": 155.0403594970703, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.16422687892108745, "grad_norm": 0.019207188859581947, "kl": 0.019034385681152344, "learning_rate": 5.985585137257401e-06, "loss": -0.0056, "num_tokens": 171771759.0, "reward": 0.5069795846939087, "reward_std": 0.07559894770383835, "rewards/gemini_judge_reward_func/mean": 0.1729910671710968, "rewards/gemini_judge_reward_func/std": 0.264567494392395, "rewards/semantic_correctness_reward_func/mean": 0.47291556000709534, "rewards/semantic_correctness_reward_func/std": 0.23877963423728943, "rewards/xmlcount_reward_func/mean": 0.8580000996589661, "rewards/xmlcount_reward_func/std": 0.35106155276298523, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 155.44644165039062, "completions/mean_terminated_length": 147.6216278076172, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.1645683069437924, "grad_norm": 0.01969590224325657, "kl": 0.015825271606445312, "learning_rate": 5.969369490868042e-06, "loss": -0.019, "num_tokens": 172126495.0, "reward": 0.4206188917160034, "reward_std": 0.06250383704900742, "rewards/gemini_judge_reward_func/mean": 0.0904017835855484, "rewards/gemini_judge_reward_func/std": 0.21680375933647156, "rewards/semantic_correctness_reward_func/mean": 0.4146478474140167, "rewards/semantic_correctness_reward_func/std": 0.20701521635055542, "rewards/xmlcount_reward_func/mean": 0.7538214921951294, "rewards/xmlcount_reward_func/std": 0.42685988545417786, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 161.5625, "completions/mean_terminated_length": 153.7928009033203, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.16490973496649738, "grad_norm": 0.02005629986524582, "kl": 0.015123367309570312, "learning_rate": 5.953143243609235e-06, "loss": -0.0137, "num_tokens": 172462793.0, "reward": 0.4773969054222107, "reward_std": 0.06708209216594696, "rewards/gemini_judge_reward_func/mean": 0.1361607164144516, "rewards/gemini_judge_reward_func/std": 0.27587687969207764, "rewards/semantic_correctness_reward_func/mean": 0.4344128966331482, "rewards/semantic_correctness_reward_func/std": 0.23222900927066803, "rewards/xmlcount_reward_func/mean": 0.8401250243186951, "rewards/xmlcount_reward_func/std": 0.3684578537940979, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 159.9553680419922, "completions/mean_terminated_length": 152.17117309570312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.16525116298920234, "grad_norm": 0.020192833617329597, "kl": 0.014693260192871094, "learning_rate": 5.936906572928625e-06, "loss": -0.0159, "num_tokens": 172781779.0, "reward": 0.44325000047683716, "reward_std": 0.06479258835315704, "rewards/gemini_judge_reward_func/mean": 0.1004464253783226, "rewards/gemini_judge_reward_func/std": 0.22325512766838074, "rewards/semantic_correctness_reward_func/mean": 0.4066070020198822, "rewards/semantic_correctness_reward_func/std": 0.18697205185890198, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 165.2857208251953, "completions/mean_terminated_length": 153.62896728515625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1655925910119073, "grad_norm": 0.02016579918563366, "kl": 0.014116764068603516, "learning_rate": 5.920659656387836e-06, "loss": -0.024, "num_tokens": 173160955.0, "reward": 0.3937126696109772, "reward_std": 0.06098075583577156, "rewards/gemini_judge_reward_func/mean": 0.0993303582072258, "rewards/gemini_judge_reward_func/std": 0.20615418255329132, "rewards/semantic_correctness_reward_func/mean": 0.4114026427268982, "rewards/semantic_correctness_reward_func/std": 0.1928534358739853, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 175.7053680419922, "completions/mean_terminated_length": 152.3577880859375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.16593401903461227, "grad_norm": 0.020396245643496513, "kl": 0.014238834381103516, "learning_rate": 5.904402671660551e-06, "loss": -0.0438, "num_tokens": 173516949.0, "reward": 0.42300131916999817, "reward_std": 0.0696081817150116, "rewards/gemini_judge_reward_func/mean": 0.109375, "rewards/gemini_judge_reward_func/std": 0.22406068444252014, "rewards/semantic_correctness_reward_func/mean": 0.3947562873363495, "rewards/semantic_correctness_reward_func/std": 0.20920373499393463, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 168.2232208251953, "completions/mean_terminated_length": 152.66363525390625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.16627544705731723, "grad_norm": 0.02036086469888687, "kl": 0.01621556282043457, "learning_rate": 5.8881357965305444e-06, "loss": -0.0085, "num_tokens": 173878327.0, "reward": 0.4611518383026123, "reward_std": 0.08240365236997604, "rewards/gemini_judge_reward_func/mean": 0.1696428507566452, "rewards/gemini_judge_reward_func/std": 0.27956220507621765, "rewards/semantic_correctness_reward_func/mean": 0.4649732708930969, "rewards/semantic_correctness_reward_func/std": 0.21387185156345367, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 165.1116180419922, "completions/mean_terminated_length": 153.45249938964844, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.16661687508002218, "grad_norm": 0.020625097677111626, "kl": 0.015746593475341797, "learning_rate": 5.871859208889759e-06, "loss": -0.0143, "num_tokens": 174251580.0, "reward": 0.4308580458164215, "reward_std": 0.06439037621021271, "rewards/gemini_judge_reward_func/mean": 0.1104910746216774, "rewards/gemini_judge_reward_func/std": 0.22288212180137634, "rewards/semantic_correctness_reward_func/mean": 0.4266740679740906, "rewards/semantic_correctness_reward_func/std": 0.22454437613487244, "rewards/xmlcount_reward_func/mean": 0.7533169984817505, "rewards/xmlcount_reward_func/std": 0.4281919598579407, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 159.9419708251953, "completions/mean_terminated_length": 156.06727600097656, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.16695830310272716, "grad_norm": 0.02049202099442482, "kl": 0.015564441680908203, "learning_rate": 5.855573086736351e-06, "loss": -0.0176, "num_tokens": 174562027.0, "reward": 0.5011279582977295, "reward_std": 0.07227544486522675, "rewards/gemini_judge_reward_func/mean": 0.1651785671710968, "rewards/gemini_judge_reward_func/std": 0.2832260727882385, "rewards/semantic_correctness_reward_func/mean": 0.45928245782852173, "rewards/semantic_correctness_reward_func/std": 0.20571810007095337, "rewards/xmlcount_reward_func/mean": 0.8580000996589661, "rewards/xmlcount_reward_func/std": 0.35106155276298523, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 156.5178680419922, "completions/mean_terminated_length": 144.74208068847656, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.16729973112543212, "grad_norm": 0.020085543394088745, "kl": 0.016694068908691406, "learning_rate": 5.839277608172739e-06, "loss": 0.006, "num_tokens": 174919303.0, "reward": 0.41649171710014343, "reward_std": 0.0659632682800293, "rewards/gemini_judge_reward_func/mean": 0.1183035746216774, "rewards/gemini_judge_reward_func/std": 0.2142873853445053, "rewards/semantic_correctness_reward_func/mean": 0.415851354598999, "rewards/semantic_correctness_reward_func/std": 0.17620055377483368, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 159.98214721679688, "completions/mean_terminated_length": 152.1981964111328, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.16764115914813707, "grad_norm": 0.019799429923295975, "kl": 0.015845775604248047, "learning_rate": 5.82297295140367e-06, "loss": -0.0318, "num_tokens": 175240287.0, "reward": 0.4717130959033966, "reward_std": 0.07357439398765564, "rewards/gemini_judge_reward_func/mean": 0.1316964328289032, "rewards/gemini_judge_reward_func/std": 0.2635558247566223, "rewards/semantic_correctness_reward_func/mean": 0.42951181530952454, "rewards/semantic_correctness_reward_func/std": 0.229256734251976, "rewards/xmlcount_reward_func/mean": 0.8328304290771484, "rewards/xmlcount_reward_func/std": 0.373677134513855, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 171.95089721679688, "completions/mean_terminated_length": 164.2747802734375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.16798258717084205, "grad_norm": 0.020750809460878372, "kl": 0.015099048614501953, "learning_rate": 5.806659294734256e-06, "loss": 0.0052, "num_tokens": 175559164.0, "reward": 0.46661117672920227, "reward_std": 0.08188282698392868, "rewards/gemini_judge_reward_func/mean": 0.1372767835855484, "rewards/gemini_judge_reward_func/std": 0.2399667352437973, "rewards/semantic_correctness_reward_func/mean": 0.4586896598339081, "rewards/semantic_correctness_reward_func/std": 0.1945486068725586, "rewards/xmlcount_reward_func/mean": 0.799906313419342, "rewards/xmlcount_reward_func/std": 0.40196701884269714, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 162.375, "completions/mean_terminated_length": 154.61260986328125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.168324015193547, "grad_norm": 0.019258791580796242, "kl": 0.013692378997802734, "learning_rate": 5.790336816568033e-06, "loss": -0.0041, "num_tokens": 175921760.0, "reward": 0.4894059896469116, "reward_std": 0.07220742106437683, "rewards/gemini_judge_reward_func/mean": 0.1495535671710968, "rewards/gemini_judge_reward_func/std": 0.2820485234260559, "rewards/semantic_correctness_reward_func/mean": 0.4319226145744324, "rewards/semantic_correctness_reward_func/std": 0.2171761691570282, "rewards/xmlcount_reward_func/mean": 0.8580000996589661, "rewards/xmlcount_reward_func/std": 0.35106152296066284, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 149.09375, "completions/mean_terminated_length": 141.2117156982422, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.168665443216252, "grad_norm": 0.020740794017910957, "kl": 0.01919412612915039, "learning_rate": 5.774005695405008e-06, "loss": 0.0211, "num_tokens": 176274773.0, "reward": 0.42292988300323486, "reward_std": 0.061523277312517166, "rewards/gemini_judge_reward_func/mean": 0.0982142835855484, "rewards/gemini_judge_reward_func/std": 0.20736749470233917, "rewards/semantic_correctness_reward_func/mean": 0.4547029137611389, "rewards/semantic_correctness_reward_func/std": 0.18523745238780975, "rewards/xmlcount_reward_func/mean": 0.7317589521408081, "rewards/xmlcount_reward_func/std": 0.4439156651496887, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 156.82144165039062, "completions/mean_terminated_length": 145.04977416992188, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.16900687123895694, "grad_norm": 0.02109484001994133, "kl": 0.018070220947265625, "learning_rate": 5.7576661098397024e-06, "loss": -0.022, "num_tokens": 176663369.0, "reward": 0.415505588054657, "reward_std": 0.07010926306247711, "rewards/gemini_judge_reward_func/mean": 0.1774553507566452, "rewards/gemini_judge_reward_func/std": 0.3051034212112427, "rewards/semantic_correctness_reward_func/mean": 0.4624207317829132, "rewards/semantic_correctness_reward_func/std": 0.2060934156179428, "rewards/xmlcount_reward_func/mean": 0.6300982236862183, "rewards/xmlcount_reward_func/std": 0.48218870162963867, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 154.91519165039062, "completions/mean_terminated_length": 147.08558654785156, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1693482992616619, "grad_norm": 0.020870883017778397, "kl": 0.014084815979003906, "learning_rate": 5.74131823855921e-06, "loss": -0.0061, "num_tokens": 177009766.0, "reward": 0.4837039113044739, "reward_std": 0.06995401531457901, "rewards/gemini_judge_reward_func/mean": 0.1696428507566452, "rewards/gemini_judge_reward_func/std": 0.28156012296676636, "rewards/semantic_correctness_reward_func/mean": 0.4704835116863251, "rewards/semantic_correctness_reward_func/std": 0.23031000792980194, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 172.81251525878906, "completions/mean_terminated_length": 149.38531494140625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.16968972728436688, "grad_norm": 0.020632216706871986, "kl": 0.017904996871948242, "learning_rate": 5.72496226034123e-06, "loss": -0.0237, "num_tokens": 177378632.0, "reward": 0.4311949610710144, "reward_std": 0.06168566271662712, "rewards/gemini_judge_reward_func/mean": 0.1071428582072258, "rewards/gemini_judge_reward_func/std": 0.24539317190647125, "rewards/semantic_correctness_reward_func/mean": 0.44018909335136414, "rewards/semantic_correctness_reward_func/std": 0.20811137557029724, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 165.08482360839844, "completions/mean_terminated_length": 153.4253387451172, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17003115530707183, "grad_norm": 0.01952049508690834, "kl": 0.01712656021118164, "learning_rate": 5.708598354052122e-06, "loss": 0.007, "num_tokens": 177727219.0, "reward": 0.4157637357711792, "reward_std": 0.05787918344140053, "rewards/gemini_judge_reward_func/mean": 0.1395089328289032, "rewards/gemini_judge_reward_func/std": 0.25457999110221863, "rewards/semantic_correctness_reward_func/mean": 0.4413006007671356, "rewards/semantic_correctness_reward_func/std": 0.212608203291893, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 152.03125, "completions/mean_terminated_length": 148.12107849121094, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.17037258332977678, "grad_norm": 0.020889515057206154, "kl": 0.017127513885498047, "learning_rate": 5.692226698644938e-06, "loss": -0.0171, "num_tokens": 178073166.0, "reward": 0.466022253036499, "reward_std": 0.07162578403949738, "rewards/gemini_judge_reward_func/mean": 0.1540178507566452, "rewards/gemini_judge_reward_func/std": 0.2349638193845749, "rewards/semantic_correctness_reward_func/mean": 0.4848252236843109, "rewards/semantic_correctness_reward_func/std": 0.20572321116924286, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 161.67857360839844, "completions/mean_terminated_length": 146.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.17071401135248176, "grad_norm": 0.020642045885324478, "kl": 0.01847362518310547, "learning_rate": 5.675847473157485e-06, "loss": -0.0314, "num_tokens": 178409774.0, "reward": 0.47827115654945374, "reward_std": 0.05761899799108505, "rewards/gemini_judge_reward_func/mean": 0.15625, "rewards/gemini_judge_reward_func/std": 0.2660224437713623, "rewards/semantic_correctness_reward_func/mean": 0.47010570764541626, "rewards/semantic_correctness_reward_func/std": 0.2151022106409073, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 152.6741180419922, "completions/mean_terminated_length": 148.76683044433594, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.17105543937518672, "grad_norm": 0.021922262385487556, "kl": 0.014860153198242188, "learning_rate": 5.659460856710346e-06, "loss": -0.0213, "num_tokens": 178727305.0, "reward": 0.45994651317596436, "reward_std": 0.06950204819440842, "rewards/gemini_judge_reward_func/mean": 0.1462053507566452, "rewards/gemini_judge_reward_func/std": 0.26806291937828064, "rewards/semantic_correctness_reward_func/mean": 0.47231271862983704, "rewards/semantic_correctness_reward_func/std": 0.22776320576667786, "rewards/xmlcount_reward_func/mean": 0.7675045728683472, "rewards/xmlcount_reward_func/std": 0.42328277230262756, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 173.3928680419922, "completions/mean_terminated_length": 149.9816436767578, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.17139686739789167, "grad_norm": 0.021201200783252716, "kl": 0.018434524536132812, "learning_rate": 5.643067028504931e-06, "loss": -0.0353, "num_tokens": 179102705.0, "reward": 0.40908685326576233, "reward_std": 0.06099972128868103, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.21042221784591675, "rewards/semantic_correctness_reward_func/mean": 0.4324699938297272, "rewards/semantic_correctness_reward_func/std": 0.19328351318836212, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 169.33929443359375, "completions/mean_terminated_length": 149.82647705078125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.17173829542059665, "grad_norm": 0.01934802159667015, "kl": 0.016072750091552734, "learning_rate": 5.626666167821522e-06, "loss": -0.0409, "num_tokens": 179471349.0, "reward": 0.420356422662735, "reward_std": 0.07176917046308517, "rewards/gemini_judge_reward_func/mean": 0.1473214328289032, "rewards/gemini_judge_reward_func/std": 0.2562088966369629, "rewards/semantic_correctness_reward_func/mean": 0.44863906502723694, "rewards/semantic_correctness_reward_func/std": 0.2033475935459137, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 157.10714721679688, "completions/mean_terminated_length": 153.21974182128906, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1720797234433016, "grad_norm": 0.021044226363301277, "kl": 0.016061782836914062, "learning_rate": 5.610258454017301e-06, "loss": -0.0048, "num_tokens": 179818133.0, "reward": 0.4817381203174591, "reward_std": 0.05548393726348877, "rewards/gemini_judge_reward_func/mean": 0.1696428507566452, "rewards/gemini_judge_reward_func/std": 0.2855140268802643, "rewards/semantic_correctness_reward_func/mean": 0.46065473556518555, "rewards/semantic_correctness_reward_func/std": 0.23221111297607422, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 183.8928680419922, "completions/mean_terminated_length": 152.7777862548828, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.17242115146600656, "grad_norm": 0.019027721136808395, "kl": 0.014852523803710938, "learning_rate": 5.593844066524401e-06, "loss": -0.0089, "num_tokens": 180172325.0, "reward": 0.39402300119400024, "reward_std": 0.05319085344672203, "rewards/gemini_judge_reward_func/mean": 0.0915178582072258, "rewards/gemini_judge_reward_func/std": 0.2220863550901413, "rewards/semantic_correctness_reward_func/mean": 0.3928290903568268, "rewards/semantic_correctness_reward_func/std": 0.20905858278274536, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 177.88839721679688, "completions/mean_terminated_length": 154.6009063720703, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.17276257948871154, "grad_norm": 0.02053193189203739, "kl": 0.01651144027709961, "learning_rate": 5.577423184847932e-06, "loss": -0.0192, "num_tokens": 180531340.0, "reward": 0.4295152425765991, "reward_std": 0.05642404407262802, "rewards/gemini_judge_reward_func/mean": 0.0837053582072258, "rewards/gemini_judge_reward_func/std": 0.18177036941051483, "rewards/semantic_correctness_reward_func/mean": 0.4116385877132416, "rewards/semantic_correctness_reward_func/std": 0.17686069011688232, "rewards/xmlcount_reward_func/mean": 0.7842634916305542, "rewards/xmlcount_reward_func/std": 0.41184648871421814, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 165.13839721679688, "completions/mean_terminated_length": 149.52272033691406, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.1731040075114165, "grad_norm": 0.01986338570713997, "kl": 0.013272762298583984, "learning_rate": 5.560995988564023e-06, "loss": 0.014, "num_tokens": 180879259.0, "reward": 0.4700644016265869, "reward_std": 0.08002512902021408, "rewards/gemini_judge_reward_func/mean": 0.1540178507566452, "rewards/gemini_judge_reward_func/std": 0.26522162556648254, "rewards/semantic_correctness_reward_func/mean": 0.43353599309921265, "rewards/semantic_correctness_reward_func/std": 0.20602788031101227, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 157.4553680419922, "completions/mean_terminated_length": 145.6923065185547, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.17344543553412145, "grad_norm": 0.02088193967938423, "kl": 0.015688419342041016, "learning_rate": 5.544562657317863e-06, "loss": -0.0335, "num_tokens": 181218669.0, "reward": 0.4463382661342621, "reward_std": 0.07036608457565308, "rewards/gemini_judge_reward_func/mean": 0.1238839253783226, "rewards/gemini_judge_reward_func/std": 0.24027937650680542, "rewards/semantic_correctness_reward_func/mean": 0.4466731548309326, "rewards/semantic_correctness_reward_func/std": 0.2101128250360489, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 164.67857360839844, "completions/mean_terminated_length": 145.05935668945312, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.17378686355682643, "grad_norm": 0.018736766651272774, "kl": 0.014445781707763672, "learning_rate": 5.52812337082173e-06, "loss": -0.0104, "num_tokens": 181593377.0, "reward": 0.43417415022850037, "reward_std": 0.06118585541844368, "rewards/gemini_judge_reward_func/mean": 0.1328125, "rewards/gemini_judge_reward_func/std": 0.26245927810668945, "rewards/semantic_correctness_reward_func/mean": 0.41679030656814575, "rewards/semantic_correctness_reward_func/std": 0.2354862242937088, "rewards/xmlcount_reward_func/mean": 0.7442277073860168, "rewards/xmlcount_reward_func/std": 0.4368407428264618, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 151.02679443359375, "completions/mean_terminated_length": 151.02679443359375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.17412829157953139, "grad_norm": 0.02046004869043827, "kl": 0.01661968231201172, "learning_rate": 5.5116783088530255e-06, "loss": -0.0076, "num_tokens": 181941603.0, "reward": 0.44803786277770996, "reward_std": 0.07080157846212387, "rewards/gemini_judge_reward_func/mean": 0.1618303507566452, "rewards/gemini_judge_reward_func/std": 0.27668073773384094, "rewards/semantic_correctness_reward_func/mean": 0.4507783055305481, "rewards/semantic_correctness_reward_func/std": 0.22416646778583527, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 181.9419708251953, "completions/mean_terminated_length": 166.63180541992188, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.17446971960223637, "grad_norm": 0.021340545266866684, "kl": 0.015408992767333984, "learning_rate": 5.495227651252315e-06, "loss": -0.0083, "num_tokens": 182312514.0, "reward": 0.41029632091522217, "reward_std": 0.06794978678226471, "rewards/gemini_judge_reward_func/mean": 0.1261160671710968, "rewards/gemini_judge_reward_func/std": 0.2391100972890854, "rewards/semantic_correctness_reward_func/mean": 0.44074922800064087, "rewards/semantic_correctness_reward_func/std": 0.20661385357379913, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 167.875, "completions/mean_terminated_length": 152.30908203125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.17481114762494132, "grad_norm": 0.01973101496696472, "kl": 0.014441490173339844, "learning_rate": 5.478771577921351e-06, "loss": -0.0045, "num_tokens": 182654030.0, "reward": 0.47068971395492554, "reward_std": 0.0690293237566948, "rewards/gemini_judge_reward_func/mean": 0.140625, "rewards/gemini_judge_reward_func/std": 0.2746405303478241, "rewards/semantic_correctness_reward_func/mean": 0.46344852447509766, "rewards/semantic_correctness_reward_func/std": 0.2121262550354004, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 161.9107208251953, "completions/mean_terminated_length": 154.14414978027344, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.17515257564764627, "grad_norm": 0.01982654258608818, "kl": 0.015091419219970703, "learning_rate": 5.4623102688211186e-06, "loss": -0.0241, "num_tokens": 182983714.0, "reward": 0.4770260155200958, "reward_std": 0.07111723721027374, "rewards/gemini_judge_reward_func/mean": 0.1116071417927742, "rewards/gemini_judge_reward_func/std": 0.2242058366537094, "rewards/semantic_correctness_reward_func/mean": 0.41016557812690735, "rewards/semantic_correctness_reward_func/std": 0.1983053833246231, "rewards/xmlcount_reward_func/mean": 0.8758750557899475, "rewards/xmlcount_reward_func/std": 0.33179107308387756, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 169.22769165039062, "completions/mean_terminated_length": 149.7123260498047, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.17549400367035126, "grad_norm": 0.018747977912425995, "kl": 0.015319347381591797, "learning_rate": 5.445843903969854e-06, "loss": -0.0188, "num_tokens": 183362849.0, "reward": 0.38659247756004333, "reward_std": 0.04423471912741661, "rewards/gemini_judge_reward_func/mean": 0.0691964253783226, "rewards/gemini_judge_reward_func/std": 0.1560075730085373, "rewards/semantic_correctness_reward_func/mean": 0.40031924843788147, "rewards/semantic_correctness_reward_func/std": 0.18306925892829895, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 169.09375, "completions/mean_terminated_length": 161.3918914794922, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1758354316930562, "grad_norm": 0.020533520728349686, "kl": 0.01604938507080078, "learning_rate": 5.429372663441086e-06, "loss": -0.0012, "num_tokens": 183731974.0, "reward": 0.4428136944770813, "reward_std": 0.07406413555145264, "rewards/gemini_judge_reward_func/mean": 0.1439732164144516, "rewards/gemini_judge_reward_func/std": 0.26507771015167236, "rewards/semantic_correctness_reward_func/mean": 0.4514612555503845, "rewards/semantic_correctness_reward_func/std": 0.20401346683502197, "rewards/xmlcount_reward_func/mean": 0.737330436706543, "rewards/xmlcount_reward_func/std": 0.439359575510025, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 162.94644165039062, "completions/mean_terminated_length": 143.28765869140625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.17617685971576116, "grad_norm": 0.02042176015675068, "kl": 0.01526498794555664, "learning_rate": 5.412896727361663e-06, "loss": 0.0011, "num_tokens": 184076814.0, "reward": 0.42131778597831726, "reward_std": 0.06919633597135544, "rewards/gemini_judge_reward_func/mean": 0.1484375, "rewards/gemini_judge_reward_func/std": 0.22710849344730377, "rewards/semantic_correctness_reward_func/mean": 0.4512138366699219, "rewards/semantic_correctness_reward_func/std": 0.19391006231307983, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 159.17857360839844, "completions/mean_terminated_length": 147.4389190673828, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.17651828773846615, "grad_norm": 0.02078302949666977, "kl": 0.014979839324951172, "learning_rate": 5.396416275909779e-06, "loss": 0.0254, "num_tokens": 184454794.0, "reward": 0.4277709722518921, "reward_std": 0.06490686535835266, "rewards/gemini_judge_reward_func/mean": 0.1104910746216774, "rewards/gemini_judge_reward_func/std": 0.21258442103862762, "rewards/semantic_correctness_reward_func/mean": 0.404167115688324, "rewards/semantic_correctness_reward_func/std": 0.19909650087356567, "rewards/xmlcount_reward_func/mean": 0.7568526864051819, "rewards/xmlcount_reward_func/std": 0.42892637848854065, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 163.63839721679688, "completions/mean_terminated_length": 151.95928955078125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.1768597157611711, "grad_norm": 0.020726703107357025, "kl": 0.017206192016601562, "learning_rate": 5.379931489313016e-06, "loss": 0.0103, "num_tokens": 184798381.0, "reward": 0.44928938150405884, "reward_std": 0.058755356818437576, "rewards/gemini_judge_reward_func/mean": 0.1316964328289032, "rewards/gemini_judge_reward_func/std": 0.28005409240722656, "rewards/semantic_correctness_reward_func/mean": 0.44580385088920593, "rewards/semantic_correctness_reward_func/std": 0.22318150103092194, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 158.07589721679688, "completions/mean_terminated_length": 138.30592346191406, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.17720114378387605, "grad_norm": 0.02273509092628956, "kl": 0.017033815383911133, "learning_rate": 5.363442547846356e-06, "loss": -0.0056, "num_tokens": 185169506.0, "reward": 0.4369128942489624, "reward_std": 0.08179816603660583, "rewards/gemini_judge_reward_func/mean": 0.1517857164144516, "rewards/gemini_judge_reward_func/std": 0.28283703327178955, "rewards/semantic_correctness_reward_func/mean": 0.45099279284477234, "rewards/semantic_correctness_reward_func/std": 0.21851813793182373, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 167.00894165039062, "completions/mean_terminated_length": 155.37557983398438, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.17754257180658103, "grad_norm": 0.02012968808412552, "kl": 0.015107154846191406, "learning_rate": 5.346949631830221e-06, "loss": -0.0077, "num_tokens": 185521656.0, "reward": 0.434848815202713, "reward_std": 0.07218959927558899, "rewards/gemini_judge_reward_func/mean": 0.140625, "rewards/gemini_judge_reward_func/std": 0.25888094305992126, "rewards/semantic_correctness_reward_func/mean": 0.4599758982658386, "rewards/semantic_correctness_reward_func/std": 0.19838641583919525, "rewards/xmlcount_reward_func/mean": 0.7165089845657349, "rewards/xmlcount_reward_func/std": 0.45138630270957947, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 180.83482360839844, "completions/mean_terminated_length": 149.60647583007812, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.177883999829286, "grad_norm": 0.021219704300165176, "kl": 0.01463770866394043, "learning_rate": 5.3304529216284974e-06, "loss": -0.0383, "num_tokens": 185866755.0, "reward": 0.4066275954246521, "reward_std": 0.0540161170065403, "rewards/gemini_judge_reward_func/mean": 0.1037946417927742, "rewards/gemini_judge_reward_func/std": 0.20530273020267487, "rewards/semantic_correctness_reward_func/mean": 0.43129876255989075, "rewards/semantic_correctness_reward_func/std": 0.20759737491607666, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 184.32589721679688, "completions/mean_terminated_length": 161.21559143066406, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.17822542785199094, "grad_norm": 0.020118800923228264, "kl": 0.014031648635864258, "learning_rate": 5.3139525976465675e-06, "loss": -0.0155, "num_tokens": 186255640.0, "reward": 0.4363122284412384, "reward_std": 0.056919172406196594, "rewards/gemini_judge_reward_func/mean": 0.1428571492433548, "rewards/gemini_judge_reward_func/std": 0.24879594147205353, "rewards/semantic_correctness_reward_func/mean": 0.4300965964794159, "rewards/semantic_correctness_reward_func/std": 0.21386457979679108, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 157.47769165039062, "completions/mean_terminated_length": 153.59193420410156, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.17856685587469592, "grad_norm": 0.01910630241036415, "kl": 0.01791095733642578, "learning_rate": 5.2974488403293285e-06, "loss": -0.0158, "num_tokens": 186620139.0, "reward": 0.4440222680568695, "reward_std": 0.06990361213684082, "rewards/gemini_judge_reward_func/mean": 0.1696428507566452, "rewards/gemini_judge_reward_func/std": 0.2913442552089691, "rewards/semantic_correctness_reward_func/mean": 0.48657554388046265, "rewards/semantic_correctness_reward_func/std": 0.2125052511692047, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 179.77679443359375, "completions/mean_terminated_length": 148.50926208496094, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.17890828389740088, "grad_norm": 0.019049007445573807, "kl": 0.015116214752197266, "learning_rate": 5.280941830159228e-06, "loss": -0.0286, "num_tokens": 187000085.0, "reward": 0.45324477553367615, "reward_std": 0.07729536294937134, "rewards/gemini_judge_reward_func/mean": 0.1729910671710968, "rewards/gemini_judge_reward_func/std": 0.29080912470817566, "rewards/semantic_correctness_reward_func/mean": 0.49024146795272827, "rewards/semantic_correctness_reward_func/std": 0.21569402515888214, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 160.98214721679688, "completions/mean_terminated_length": 149.2669677734375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.17924971192010583, "grad_norm": 0.020367106422781944, "kl": 0.016560077667236328, "learning_rate": 5.264431747654284e-06, "loss": 0.0176, "num_tokens": 187355309.0, "reward": 0.42137518525123596, "reward_std": 0.0671260803937912, "rewards/gemini_judge_reward_func/mean": 0.1529017835855484, "rewards/gemini_judge_reward_func/std": 0.25565895438194275, "rewards/semantic_correctness_reward_func/mean": 0.43138477206230164, "rewards/semantic_correctness_reward_func/std": 0.203949972987175, "rewards/xmlcount_reward_func/mean": 0.6848437190055847, "rewards/xmlcount_reward_func/std": 0.46544113755226135, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 156.57144165039062, "completions/mean_terminated_length": 144.79638671875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1795911399428108, "grad_norm": 0.02240682952105999, "kl": 0.013727188110351562, "learning_rate": 5.247918773366112e-06, "loss": -0.0034, "num_tokens": 187717201.0, "reward": 0.44608941674232483, "reward_std": 0.07263628393411636, "rewards/gemini_judge_reward_func/mean": 0.1383928507566452, "rewards/gemini_judge_reward_func/std": 0.26116180419921875, "rewards/semantic_correctness_reward_func/mean": 0.4700183868408203, "rewards/semantic_correctness_reward_func/std": 0.23790188133716583, "rewards/xmlcount_reward_func/mean": 0.7418214082717896, "rewards/xmlcount_reward_func/std": 0.4343191385269165, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 171.1919708251953, "completions/mean_terminated_length": 151.72145080566406, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.17993256796551577, "grad_norm": 0.020298024639487267, "kl": 0.012287616729736328, "learning_rate": 5.231403087877955e-06, "loss": 0.0, "num_tokens": 188043636.0, "reward": 0.5017148852348328, "reward_std": 0.05507539212703705, "rewards/gemini_judge_reward_func/mean": 0.1930803507566452, "rewards/gemini_judge_reward_func/std": 0.29825180768966675, "rewards/semantic_correctness_reward_func/mean": 0.4779132306575775, "rewards/semantic_correctness_reward_func/std": 0.2128426432609558, "rewards/xmlcount_reward_func/mean": 0.8222500681877136, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 186.10714721679688, "completions/mean_terminated_length": 159.07833862304688, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.18027399598822072, "grad_norm": 0.02056858502328396, "kl": 0.015903472900390625, "learning_rate": 5.214884871802703e-06, "loss": -0.0112, "num_tokens": 188390548.0, "reward": 0.42851802706718445, "reward_std": 0.06244615837931633, "rewards/gemini_judge_reward_func/mean": 0.1328125, "rewards/gemini_judge_reward_func/std": 0.24931590259075165, "rewards/semantic_correctness_reward_func/mean": 0.4469650685787201, "rewards/semantic_correctness_reward_func/std": 0.2338939607143402, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 164.9553680419922, "completions/mean_terminated_length": 149.33636474609375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1806154240109257, "grad_norm": 0.01978747919201851, "kl": 0.01563262939453125, "learning_rate": 5.198364305780922e-06, "loss": -0.0333, "num_tokens": 188762902.0, "reward": 0.4285474121570587, "reward_std": 0.06751622259616852, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.2482723742723465, "rewards/semantic_correctness_reward_func/mean": 0.4270046353340149, "rewards/semantic_correctness_reward_func/std": 0.20151624083518982, "rewards/xmlcount_reward_func/mean": 0.737330436706543, "rewards/xmlcount_reward_func/std": 0.4393596053123474, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 152.0625, "completions/mean_terminated_length": 148.15248107910156, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.18095685203363066, "grad_norm": 0.020898984745144844, "kl": 0.01631021499633789, "learning_rate": 5.1818415704788725e-06, "loss": -0.0143, "num_tokens": 189098264.0, "reward": 0.41772809624671936, "reward_std": 0.05771636217832565, "rewards/gemini_judge_reward_func/mean": 0.0982142835855484, "rewards/gemini_judge_reward_func/std": 0.19482412934303284, "rewards/semantic_correctness_reward_func/mean": 0.39071187376976013, "rewards/semantic_correctness_reward_func/std": 0.19836723804473877, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 148.0178680419922, "completions/mean_terminated_length": 136.126708984375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.18129828005633564, "grad_norm": 0.02229609526693821, "kl": 0.016646385192871094, "learning_rate": 5.165316846586541e-06, "loss": -0.0295, "num_tokens": 189447896.0, "reward": 0.4175770878791809, "reward_std": 0.07173692435026169, "rewards/gemini_judge_reward_func/mean": 0.1261160671710968, "rewards/gemini_judge_reward_func/std": 0.244900643825531, "rewards/semantic_correctness_reward_func/mean": 0.4398496448993683, "rewards/semantic_correctness_reward_func/std": 0.2049618512392044, "rewards/xmlcount_reward_func/mean": 0.6979018449783325, "rewards/xmlcount_reward_func/std": 0.45866456627845764, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 167.4375, "completions/mean_terminated_length": 151.86363220214844, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1816397080790406, "grad_norm": 0.022729597985744476, "kl": 0.016128063201904297, "learning_rate": 5.148790314815662e-06, "loss": -0.0083, "num_tokens": 189805810.0, "reward": 0.40617290139198303, "reward_std": 0.08018369972705841, "rewards/gemini_judge_reward_func/mean": 0.1462053507566452, "rewards/gemini_judge_reward_func/std": 0.2627832591533661, "rewards/semantic_correctness_reward_func/mean": 0.4157036244869232, "rewards/semantic_correctness_reward_func/std": 0.19158992171287537, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 150.22769165039062, "completions/mean_terminated_length": 142.35586547851562, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.18198113610174554, "grad_norm": 0.021641992032527924, "kl": 0.016852378845214844, "learning_rate": 5.132262155897739e-06, "loss": 0.0189, "num_tokens": 190160581.0, "reward": 0.4324653148651123, "reward_std": 0.07647126168012619, "rewards/gemini_judge_reward_func/mean": 0.1473214328289032, "rewards/gemini_judge_reward_func/std": 0.294861763715744, "rewards/semantic_correctness_reward_func/mean": 0.4376834034919739, "rewards/semantic_correctness_reward_func/std": 0.23118866980075836, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 152.13394165039062, "completions/mean_terminated_length": 144.27928161621094, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.18232256412445053, "grad_norm": 0.020904729142785072, "kl": 0.017549514770507812, "learning_rate": 5.11573255058207e-06, "loss": 0.0034, "num_tokens": 190511795.0, "reward": 0.4001460373401642, "reward_std": 0.05920318514108658, "rewards/gemini_judge_reward_func/mean": 0.0881696417927742, "rewards/gemini_judge_reward_func/std": 0.19316980242729187, "rewards/semantic_correctness_reward_func/mean": 0.394390732049942, "rewards/semantic_correctness_reward_func/std": 0.19004258513450623, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 157.79464721679688, "completions/mean_terminated_length": 153.9103240966797, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.18266399214715548, "grad_norm": 0.020164046436548233, "kl": 0.01853799819946289, "learning_rate": 5.099201679633769e-06, "loss": 0.0122, "num_tokens": 190839637.0, "reward": 0.4808293879032135, "reward_std": 0.07350355386734009, "rewards/gemini_judge_reward_func/mean": 0.1696428507566452, "rewards/gemini_judge_reward_func/std": 0.3008103370666504, "rewards/semantic_correctness_reward_func/mean": 0.4561111032962799, "rewards/semantic_correctness_reward_func/std": 0.250588595867157, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.39858436584472656, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 162.5982208251953, "completions/mean_terminated_length": 138.88990783691406, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.18300542016986043, "grad_norm": 0.020656241104006767, "kl": 0.015102148056030273, "learning_rate": 5.082669723831793e-06, "loss": -0.0378, "num_tokens": 191234363.0, "reward": 0.3954419493675232, "reward_std": 0.052222900092601776, "rewards/gemini_judge_reward_func/mean": 0.1540178507566452, "rewards/gemini_judge_reward_func/std": 0.27251842617988586, "rewards/semantic_correctness_reward_func/mean": 0.4179239869117737, "rewards/semantic_correctness_reward_func/std": 0.2332126796245575, "rewards/xmlcount_reward_func/mean": 0.6256250739097595, "rewards/xmlcount_reward_func/std": 0.48569241166114807, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 143.3482208251953, "completions/mean_terminated_length": 143.3482208251953, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.18334684819256541, "grad_norm": 0.02267182618379593, "kl": 0.017610549926757812, "learning_rate": 5.066136863966963e-06, "loss": -0.0024, "num_tokens": 191593785.0, "reward": 0.43893885612487793, "reward_std": 0.055210184305906296, "rewards/gemini_judge_reward_func/mean": 0.1484375, "rewards/gemini_judge_reward_func/std": 0.25169339776039124, "rewards/semantic_correctness_reward_func/mean": 0.4320690929889679, "rewards/semantic_correctness_reward_func/std": 0.2041468471288681, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 145.40179443359375, "completions/mean_terminated_length": 145.40179443359375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.18368827621527037, "grad_norm": 0.023711949586868286, "kl": 0.017864704132080078, "learning_rate": 5.049603280839982e-06, "loss": -0.0087, "num_tokens": 191961767.0, "reward": 0.4316798150539398, "reward_std": 0.07676589488983154, "rewards/gemini_judge_reward_func/mean": 0.1584821492433548, "rewards/gemini_judge_reward_func/std": 0.2957007586956024, "rewards/semantic_correctness_reward_func/mean": 0.4338454306125641, "rewards/semantic_correctness_reward_func/std": 0.2067124843597412, "rewards/xmlcount_reward_func/mean": 0.7037946581840515, "rewards/xmlcount_reward_func/std": 0.4546814262866974, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 166.58929443359375, "completions/mean_terminated_length": 147.01368713378906, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.18402970423797532, "grad_norm": 0.020993690937757492, "kl": 0.015955448150634766, "learning_rate": 5.033069155259471e-06, "loss": 0.0114, "num_tokens": 192308643.0, "reward": 0.44476279616355896, "reward_std": 0.08589890599250793, "rewards/gemini_judge_reward_func/mean": 0.1662946492433548, "rewards/gemini_judge_reward_func/std": 0.28207293152809143, "rewards/semantic_correctness_reward_func/mean": 0.4254744052886963, "rewards/semantic_correctness_reward_func/std": 0.21944975852966309, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 161.5982208251953, "completions/mean_terminated_length": 141.90867614746094, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1843711322606803, "grad_norm": 0.021806977689266205, "kl": 0.018055438995361328, "learning_rate": 5.016534668039976e-06, "loss": 0.0092, "num_tokens": 192682705.0, "reward": 0.3981616199016571, "reward_std": 0.07369980216026306, "rewards/gemini_judge_reward_func/mean": 0.1361607164144516, "rewards/gemini_judge_reward_func/std": 0.24234557151794434, "rewards/semantic_correctness_reward_func/mean": 0.4368884265422821, "rewards/semantic_correctness_reward_func/std": 0.20789456367492676, "rewards/xmlcount_reward_func/mean": 0.6407991051673889, "rewards/xmlcount_reward_func/std": 0.47512152791023254, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 161.8482208251953, "completions/mean_terminated_length": 142.1643829345703, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.18471256028338526, "grad_norm": 0.02082211524248123, "kl": 0.016353130340576172, "learning_rate": 5e-06, "loss": 0.0069, "num_tokens": 193064427.0, "reward": 0.377638041973114, "reward_std": 0.048613447695970535, "rewards/gemini_judge_reward_func/mean": 0.1294642835855484, "rewards/gemini_judge_reward_func/std": 0.24939870834350586, "rewards/semantic_correctness_reward_func/mean": 0.434475839138031, "rewards/semantic_correctness_reward_func/std": 0.22424183785915375, "rewards/xmlcount_reward_func/mean": 0.5973929166793823, "rewards/xmlcount_reward_func/std": 0.48817571997642517, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 151.7544708251953, "completions/mean_terminated_length": 147.8430633544922, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1850539883060902, "grad_norm": 0.023179858922958374, "kl": 0.018024682998657227, "learning_rate": 4.983465331960025e-06, "loss": -0.034, "num_tokens": 193415356.0, "reward": 0.4562413692474365, "reward_std": 0.06751704216003418, "rewards/gemini_judge_reward_func/mean": 0.1294642835855484, "rewards/gemini_judge_reward_func/std": 0.24140411615371704, "rewards/semantic_correctness_reward_func/mean": 0.4492780268192291, "rewards/semantic_correctness_reward_func/std": 0.21333856880664825, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 166.71429443359375, "completions/mean_terminated_length": 147.14154052734375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.1853954163287952, "grad_norm": 0.019812671467661858, "kl": 0.015459537506103516, "learning_rate": 4.96693084474053e-06, "loss": 0.0041, "num_tokens": 193767172.0, "reward": 0.4424658417701721, "reward_std": 0.07584776729345322, "rewards/gemini_judge_reward_func/mean": 0.1361607164144516, "rewards/gemini_judge_reward_func/std": 0.2457905411720276, "rewards/semantic_correctness_reward_func/mean": 0.4385075569152832, "rewards/semantic_correctness_reward_func/std": 0.21686992049217224, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 166.4553680419922, "completions/mean_terminated_length": 154.81448364257812, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.18573684435150015, "grad_norm": 0.020016420632600784, "kl": 0.011565446853637695, "learning_rate": 4.950396719160019e-06, "loss": -0.0232, "num_tokens": 194111730.0, "reward": 0.4828590750694275, "reward_std": 0.06606737524271011, "rewards/gemini_judge_reward_func/mean": 0.1428571492433548, "rewards/gemini_judge_reward_func/std": 0.27038896083831787, "rewards/semantic_correctness_reward_func/mean": 0.43940237164497375, "rewards/semantic_correctness_reward_func/std": 0.20458753407001495, "rewards/xmlcount_reward_func/mean": 0.844589352607727, "rewards/xmlcount_reward_func/std": 0.3580341935157776, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 153.55357360839844, "completions/mean_terminated_length": 137.72726440429688, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1860782723742051, "grad_norm": 0.02174554578959942, "kl": 0.018509387969970703, "learning_rate": 4.93386313603304e-06, "loss": -0.0003, "num_tokens": 194440186.0, "reward": 0.47312822937965393, "reward_std": 0.06163394823670387, "rewards/gemini_judge_reward_func/mean": 0.1473214328289032, "rewards/gemini_judge_reward_func/std": 0.23450541496276855, "rewards/semantic_correctness_reward_func/mean": 0.4622481167316437, "rewards/semantic_correctness_reward_func/std": 0.23137818276882172, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 155.45089721679688, "completions/mean_terminated_length": 135.62100219726562, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.18641970039691008, "grad_norm": 0.020509352907538414, "kl": 0.016957759857177734, "learning_rate": 4.917330276168208e-06, "loss": 0.0021, "num_tokens": 194803295.0, "reward": 0.4079745411872864, "reward_std": 0.06811019033193588, "rewards/gemini_judge_reward_func/mean": 0.0993303582072258, "rewards/gemini_judge_reward_func/std": 0.2218693345785141, "rewards/semantic_correctness_reward_func/mean": 0.44696179032325745, "rewards/semantic_correctness_reward_func/std": 0.19916366040706635, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 157.58482360839844, "completions/mean_terminated_length": 141.83181762695312, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.18676112841961504, "grad_norm": 0.022121498361229897, "kl": 0.02248358726501465, "learning_rate": 4.900798320366233e-06, "loss": -0.0047, "num_tokens": 195182922.0, "reward": 0.4160357713699341, "reward_std": 0.05240814387798309, "rewards/gemini_judge_reward_func/mean": 0.0926339253783226, "rewards/gemini_judge_reward_func/std": 0.2235099822282791, "rewards/semantic_correctness_reward_func/mean": 0.42916086316108704, "rewards/semantic_correctness_reward_func/std": 0.188674196600914, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 156.20982360839844, "completions/mean_terminated_length": 148.3918914794922, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.18710255644232002, "grad_norm": 0.02068450301885605, "kl": 0.018212318420410156, "learning_rate": 4.884267449417932e-06, "loss": 0.0021, "num_tokens": 195525961.0, "reward": 0.4415396451950073, "reward_std": 0.06090681999921799, "rewards/gemini_judge_reward_func/mean": 0.1339285671710968, "rewards/gemini_judge_reward_func/std": 0.2342064529657364, "rewards/semantic_correctness_reward_func/mean": 0.4383408725261688, "rewards/semantic_correctness_reward_func/std": 0.1995237171649933, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 149.54464721679688, "completions/mean_terminated_length": 145.62332153320312, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.18744398446502497, "grad_norm": 0.020279210060834885, "kl": 0.014973640441894531, "learning_rate": 4.867737844102261e-06, "loss": -0.0146, "num_tokens": 195887403.0, "reward": 0.44515714049339294, "reward_std": 0.05280788615345955, "rewards/gemini_judge_reward_func/mean": 0.1127232164144516, "rewards/gemini_judge_reward_func/std": 0.22175651788711548, "rewards/semantic_correctness_reward_func/mean": 0.4273391664028168, "rewards/semantic_correctness_reward_func/std": 0.21332186460494995, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 166.63839721679688, "completions/mean_terminated_length": 155.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.18778541248772992, "grad_norm": 0.019517678767442703, "kl": 0.012271404266357422, "learning_rate": 4.851209685184339e-06, "loss": 0.0063, "num_tokens": 196253170.0, "reward": 0.4689396023750305, "reward_std": 0.08325158804655075, "rewards/gemini_judge_reward_func/mean": 0.1919642835855484, "rewards/gemini_judge_reward_func/std": 0.29944294691085815, "rewards/semantic_correctness_reward_func/mean": 0.4682067334651947, "rewards/semantic_correctness_reward_func/std": 0.24702706933021545, "rewards/xmlcount_reward_func/mean": 0.7462812662124634, "rewards/xmlcount_reward_func/std": 0.4369716942310333, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 164.35269165039062, "completions/mean_terminated_length": 144.7260284423828, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.1881268405104349, "grad_norm": 0.021506622433662415, "kl": 0.014761686325073242, "learning_rate": 4.8346831534134595e-06, "loss": -0.0001, "num_tokens": 196590861.0, "reward": 0.4275391399860382, "reward_std": 0.07103725522756577, "rewards/gemini_judge_reward_func/mean": 0.1361607164144516, "rewards/gemini_judge_reward_func/std": 0.25474947690963745, "rewards/semantic_correctness_reward_func/mean": 0.3996242582798004, "rewards/semantic_correctness_reward_func/std": 0.20673991739749908, "rewards/xmlcount_reward_func/mean": 0.7328750491142273, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 156.5178680419922, "completions/mean_terminated_length": 144.74208068847656, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.18846826853313986, "grad_norm": 0.022670861333608627, "kl": 0.017201900482177734, "learning_rate": 4.818158429521129e-06, "loss": -0.0106, "num_tokens": 196951165.0, "reward": 0.4059324860572815, "reward_std": 0.05520794540643692, "rewards/gemini_judge_reward_func/mean": 0.0959821417927742, "rewards/gemini_judge_reward_func/std": 0.18865090608596802, "rewards/semantic_correctness_reward_func/mean": 0.45956405997276306, "rewards/semantic_correctness_reward_func/std": 0.21358434855937958, "rewards/xmlcount_reward_func/mean": 0.689067006111145, "rewards/xmlcount_reward_func/std": 0.4582628309726715, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 158.15625, "completions/mean_terminated_length": 142.41363525390625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1888096965558448, "grad_norm": 0.02014843560755253, "kl": 0.015711545944213867, "learning_rate": 4.801635694219079e-06, "loss": -0.0282, "num_tokens": 197304152.0, "reward": 0.4128032624721527, "reward_std": 0.06284855306148529, "rewards/gemini_judge_reward_func/mean": 0.109375, "rewards/gemini_judge_reward_func/std": 0.22530809044837952, "rewards/semantic_correctness_reward_func/mean": 0.4152660667896271, "rewards/semantic_correctness_reward_func/std": 0.2113528996706009, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 150.3928680419922, "completions/mean_terminated_length": 134.50909423828125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1891511245785498, "grad_norm": 0.024563709273934364, "kl": 0.020460128784179688, "learning_rate": 4.785115128197298e-06, "loss": -0.0112, "num_tokens": 197680252.0, "reward": 0.4380243420600891, "reward_std": 0.09170064330101013, "rewards/gemini_judge_reward_func/mean": 0.1640625, "rewards/gemini_judge_reward_func/std": 0.27535709738731384, "rewards/semantic_correctness_reward_func/mean": 0.443183958530426, "rewards/semantic_correctness_reward_func/std": 0.2359272539615631, "rewards/xmlcount_reward_func/mean": 0.709406316280365, "rewards/xmlcount_reward_func/std": 0.45243263244628906, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 161.38839721679688, "completions/mean_terminated_length": 149.67874145507812, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.18949255260125475, "grad_norm": 0.019087178632616997, "kl": 0.014681816101074219, "learning_rate": 4.768596912122046e-06, "loss": -0.031, "num_tokens": 198046059.0, "reward": 0.43682870268821716, "reward_std": 0.052002809941768646, "rewards/gemini_judge_reward_func/mean": 0.0770089253783226, "rewards/gemini_judge_reward_func/std": 0.17215459048748016, "rewards/semantic_correctness_reward_func/mean": 0.4213755428791046, "rewards/semantic_correctness_reward_func/std": 0.18203482031822205, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 184.30357360839844, "completions/mean_terminated_length": 145.06541442871094, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1898339806239597, "grad_norm": 0.019434470683336258, "kl": 0.013373613357543945, "learning_rate": 4.752081226633888e-06, "loss": -0.0156, "num_tokens": 198423511.0, "reward": 0.3853397071361542, "reward_std": 0.05791494622826576, "rewards/gemini_judge_reward_func/mean": 0.1450892835855484, "rewards/gemini_judge_reward_func/std": 0.2873225510120392, "rewards/semantic_correctness_reward_func/mean": 0.45006465911865234, "rewards/semantic_correctness_reward_func/std": 0.21861128509044647, "rewards/xmlcount_reward_func/mean": 0.5932276844978333, "rewards/xmlcount_reward_func/std": 0.49093925952911377, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 165.5, "completions/mean_terminated_length": 141.87155151367188, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.19017540864666468, "grad_norm": 0.01992631144821644, "kl": 0.01584911346435547, "learning_rate": 4.735568252345718e-06, "loss": -0.019, "num_tokens": 198796363.0, "reward": 0.3802779018878937, "reward_std": 0.050197649747133255, "rewards/gemini_judge_reward_func/mean": 0.1194196417927742, "rewards/gemini_judge_reward_func/std": 0.23904728889465332, "rewards/semantic_correctness_reward_func/mean": 0.43720176815986633, "rewards/semantic_correctness_reward_func/std": 0.17973925173282623, "rewards/xmlcount_reward_func/mean": 0.6126741766929626, "rewards/xmlcount_reward_func/std": 0.48539677262306213, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 157.01339721679688, "completions/mean_terminated_length": 145.24435424804688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.19051683666936964, "grad_norm": 0.021687161177396774, "kl": 0.020299911499023438, "learning_rate": 4.719058169840773e-06, "loss": 0.015, "num_tokens": 199166094.0, "reward": 0.42609551548957825, "reward_std": 0.06808494031429291, "rewards/gemini_judge_reward_func/mean": 0.1194196417927742, "rewards/gemini_judge_reward_func/std": 0.2378719598054886, "rewards/semantic_correctness_reward_func/mean": 0.4080309271812439, "rewards/semantic_correctness_reward_func/std": 0.2078658640384674, "rewards/xmlcount_reward_func/mean": 0.7418035864830017, "rewards/xmlcount_reward_func/std": 0.43432918190956116, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 146.4241180419922, "completions/mean_terminated_length": 146.4241180419922, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.1908582646920746, "grad_norm": 0.020877644419670105, "kl": 0.01645803451538086, "learning_rate": 4.702551159670672e-06, "loss": -0.0548, "num_tokens": 199510785.0, "reward": 0.45165494084358215, "reward_std": 0.07315388321876526, "rewards/gemini_judge_reward_func/mean": 0.1439732164144516, "rewards/gemini_judge_reward_func/std": 0.24185720086097717, "rewards/semantic_correctness_reward_func/mean": 0.433077871799469, "rewards/semantic_correctness_reward_func/std": 0.2338293194770813, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 141.375, "completions/mean_terminated_length": 141.375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.19119969271477957, "grad_norm": 0.02385268360376358, "kl": 0.014994382858276367, "learning_rate": 4.686047402353433e-06, "loss": 0.0111, "num_tokens": 199862417.0, "reward": 0.47651809453964233, "reward_std": 0.06831540167331696, "rewards/gemini_judge_reward_func/mean": 0.1618303507566452, "rewards/gemini_judge_reward_func/std": 0.29813432693481445, "rewards/semantic_correctness_reward_func/mean": 0.4501795768737793, "rewards/semantic_correctness_reward_func/std": 0.22495129704475403, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 156.93304443359375, "completions/mean_terminated_length": 149.1216278076172, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.19154112073748453, "grad_norm": 0.022106066346168518, "kl": 0.015239715576171875, "learning_rate": 4.669547078371503e-06, "loss": -0.0075, "num_tokens": 200199170.0, "reward": 0.4726813733577728, "reward_std": 0.06441599130630493, "rewards/gemini_judge_reward_func/mean": 0.1674107164144516, "rewards/gemini_judge_reward_func/std": 0.26019221544265747, "rewards/semantic_correctness_reward_func/mean": 0.4555852711200714, "rewards/semantic_correctness_reward_func/std": 0.21719810366630554, "rewards/xmlcount_reward_func/mean": 0.786500096321106, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 169.24107360839844, "completions/mean_terminated_length": 149.7260284423828, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.19188254876018948, "grad_norm": 0.019646869972348213, "kl": 0.012475013732910156, "learning_rate": 4.65305036816978e-06, "loss": -0.0089, "num_tokens": 200528212.0, "reward": 0.4791014790534973, "reward_std": 0.06499748677015305, "rewards/gemini_judge_reward_func/mean": 0.1417410671710968, "rewards/gemini_judge_reward_func/std": 0.24661129713058472, "rewards/semantic_correctness_reward_func/mean": 0.4317750036716461, "rewards/semantic_correctness_reward_func/std": 0.2213517427444458, "rewards/xmlcount_reward_func/mean": 0.8401250839233398, "rewards/xmlcount_reward_func/std": 0.3684578835964203, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 168.69644165039062, "completions/mean_terminated_length": 141.10598754882812, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.19222397678289446, "grad_norm": 0.020795688033103943, "kl": 0.012841224670410156, "learning_rate": 4.636557452153645e-06, "loss": -0.0124, "num_tokens": 200886308.0, "reward": 0.4488481879234314, "reward_std": 0.05293935909867287, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.24715863168239594, "rewards/semantic_correctness_reward_func/mean": 0.4502943158149719, "rewards/semantic_correctness_reward_func/std": 0.23604029417037964, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 151.6116180419922, "completions/mean_terminated_length": 143.75225830078125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.19256540480559942, "grad_norm": 0.020217539742588997, "kl": 0.01593160629272461, "learning_rate": 4.620068510686985e-06, "loss": -0.0059, "num_tokens": 201230369.0, "reward": 0.453321635723114, "reward_std": 0.06796573847532272, "rewards/gemini_judge_reward_func/mean": 0.1372767835855484, "rewards/gemini_judge_reward_func/std": 0.24459391832351685, "rewards/semantic_correctness_reward_func/mean": 0.4190545082092285, "rewards/semantic_correctness_reward_func/std": 0.19237849116325378, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 170.2678680419922, "completions/mean_terminated_length": 142.7281036376953, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.19290683282830437, "grad_norm": 0.0209835022687912, "kl": 0.01555776596069336, "learning_rate": 4.60358372409022e-06, "loss": -0.0456, "num_tokens": 201595869.0, "reward": 0.39094409346580505, "reward_std": 0.0595240518450737, "rewards/gemini_judge_reward_func/mean": 0.1037946417927742, "rewards/gemini_judge_reward_func/std": 0.22234255075454712, "rewards/semantic_correctness_reward_func/mean": 0.42662209272384644, "rewards/semantic_correctness_reward_func/std": 0.2272646129131317, "rewards/xmlcount_reward_func/mean": 0.6602544784545898, "rewards/xmlcount_reward_func/std": 0.4744928777217865, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 194.9419708251953, "completions/mean_terminated_length": 156.200927734375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.19324826085100935, "grad_norm": 0.021231811493635178, "kl": 0.014326095581054688, "learning_rate": 4.587103272638339e-06, "loss": -0.0311, "num_tokens": 201987592.0, "reward": 0.4055339992046356, "reward_std": 0.05433611199259758, "rewards/gemini_judge_reward_func/mean": 0.1127232164144516, "rewards/gemini_judge_reward_func/std": 0.2480076402425766, "rewards/semantic_correctness_reward_func/mean": 0.44372352957725525, "rewards/semantic_correctness_reward_func/std": 0.21770647168159485, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 145.13394165039062, "completions/mean_terminated_length": 145.13394165039062, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.1935896888737143, "grad_norm": 0.02045750990509987, "kl": 0.013641357421875, "learning_rate": 4.570627336558915e-06, "loss": 0.0053, "num_tokens": 202330202.0, "reward": 0.45321542024612427, "reward_std": 0.056673984974622726, "rewards/gemini_judge_reward_func/mean": 0.1328125, "rewards/gemini_judge_reward_func/std": 0.23184122145175934, "rewards/semantic_correctness_reward_func/mean": 0.4274519979953766, "rewards/semantic_correctness_reward_func/std": 0.22571316361427307, "rewards/xmlcount_reward_func/mean": 0.786500096321106, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 154.75, "completions/mean_terminated_length": 146.91893005371094, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.1939311168964193, "grad_norm": 0.022775255143642426, "kl": 0.01674032211303711, "learning_rate": 4.554156096030149e-06, "loss": -0.0144, "num_tokens": 202703858.0, "reward": 0.43068042397499084, "reward_std": 0.055178917944431305, "rewards/gemini_judge_reward_func/mean": 0.1261160671710968, "rewards/gemini_judge_reward_func/std": 0.25167351961135864, "rewards/semantic_correctness_reward_func/mean": 0.4555181860923767, "rewards/semantic_correctness_reward_func/std": 0.2062966674566269, "rewards/xmlcount_reward_func/mean": 0.7228259444236755, "rewards/xmlcount_reward_func/std": 0.4447035491466522, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 152.7544708251953, "completions/mean_terminated_length": 144.90541076660156, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.19427254491912424, "grad_norm": 0.02390287257730961, "kl": 0.0160367488861084, "learning_rate": 4.537689731178883e-06, "loss": -0.0266, "num_tokens": 203091631.0, "reward": 0.4275432527065277, "reward_std": 0.07999604940414429, "rewards/gemini_judge_reward_func/mean": 0.1584821492433548, "rewards/gemini_judge_reward_func/std": 0.2709713876247406, "rewards/semantic_correctness_reward_func/mean": 0.3907518982887268, "rewards/semantic_correctness_reward_func/std": 0.20916995406150818, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 154.55357360839844, "completions/mean_terminated_length": 142.7511444091797, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1946139729418292, "grad_norm": 0.022180769592523575, "kl": 0.014091014862060547, "learning_rate": 4.5212284220786495e-06, "loss": -0.034, "num_tokens": 203441075.0, "reward": 0.4240075349807739, "reward_std": 0.05866669490933418, "rewards/gemini_judge_reward_func/mean": 0.0959821417927742, "rewards/gemini_judge_reward_func/std": 0.19450274109840393, "rewards/semantic_correctness_reward_func/mean": 0.4265732169151306, "rewards/semantic_correctness_reward_func/std": 0.18987764418125153, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 139.49107360839844, "completions/mean_terminated_length": 139.49107360839844, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.19495540096453418, "grad_norm": 0.0199284665286541, "kl": 0.013919830322265625, "learning_rate": 4.504772348747687e-06, "loss": -0.0205, "num_tokens": 203810761.0, "reward": 0.4262349307537079, "reward_std": 0.07754707336425781, "rewards/gemini_judge_reward_func/mean": 0.1495535671710968, "rewards/gemini_judge_reward_func/std": 0.2515864968299866, "rewards/semantic_correctness_reward_func/mean": 0.45570138096809387, "rewards/semantic_correctness_reward_func/std": 0.2322954386472702, "rewards/xmlcount_reward_func/mean": 0.6881831288337708, "rewards/xmlcount_reward_func/std": 0.4650120735168457, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 160.74107360839844, "completions/mean_terminated_length": 145.0454559326172, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.19529682898723913, "grad_norm": 0.020155394449830055, "kl": 0.016458988189697266, "learning_rate": 4.488321691146975e-06, "loss": 0.0005, "num_tokens": 204169107.0, "reward": 0.4155966341495514, "reward_std": 0.05478544905781746, "rewards/gemini_judge_reward_func/mean": 0.1194196417927742, "rewards/gemini_judge_reward_func/std": 0.22453764081001282, "rewards/semantic_correctness_reward_func/mean": 0.40914371609687805, "rewards/semantic_correctness_reward_func/std": 0.1803514063358307, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 145.82144165039062, "completions/mean_terminated_length": 137.909912109375, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.19563825700994408, "grad_norm": 0.021136289462447166, "kl": 0.021631717681884766, "learning_rate": 4.471876629178273e-06, "loss": -0.0114, "num_tokens": 204491755.0, "reward": 0.451036274433136, "reward_std": 0.06088856980204582, "rewards/gemini_judge_reward_func/mean": 0.1372767835855484, "rewards/gemini_judge_reward_func/std": 0.24459391832351685, "rewards/semantic_correctness_reward_func/mean": 0.43219009041786194, "rewards/semantic_correctness_reward_func/std": 0.2034158557653427, "rewards/xmlcount_reward_func/mean": 0.7742188572883606, "rewards/xmlcount_reward_func/std": 0.4162629544734955, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 163.20982360839844, "completions/mean_terminated_length": 147.55908203125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.19597968503264906, "grad_norm": 0.02188599295914173, "kl": 0.01761460304260254, "learning_rate": 4.4554373426821375e-06, "loss": 0.0214, "num_tokens": 204838910.0, "reward": 0.4334542155265808, "reward_std": 0.06716626137495041, "rewards/gemini_judge_reward_func/mean": 0.1305803507566452, "rewards/gemini_judge_reward_func/std": 0.2207612693309784, "rewards/semantic_correctness_reward_func/mean": 0.47611019015312195, "rewards/semantic_correctness_reward_func/std": 0.19524888694286346, "rewards/xmlcount_reward_func/mean": 0.7150000929832458, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 152.5178680419922, "completions/mean_terminated_length": 148.60987854003906, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.19632111305535402, "grad_norm": 0.027467962354421616, "kl": 0.02365851402282715, "learning_rate": 4.439004011435979e-06, "loss": 0.0124, "num_tokens": 205200686.0, "reward": 0.4516783654689789, "reward_std": 0.050351765006780624, "rewards/gemini_judge_reward_func/mean": 0.1651785671710968, "rewards/gemini_judge_reward_func/std": 0.3023702800273895, "rewards/semantic_correctness_reward_func/mean": 0.42674875259399414, "rewards/semantic_correctness_reward_func/std": 0.2154146283864975, "rewards/xmlcount_reward_func/mean": 0.7506428956985474, "rewards/xmlcount_reward_func/std": 0.4343574345111847, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 150.6607208251953, "completions/mean_terminated_length": 142.7928009033203, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.19666254107805897, "grad_norm": 0.021642692387104034, "kl": 0.014559745788574219, "learning_rate": 4.42257681515207e-06, "loss": 0.0003, "num_tokens": 205540602.0, "reward": 0.4592445492744446, "reward_std": 0.07840275019407272, "rewards/gemini_judge_reward_func/mean": 0.1305803507566452, "rewards/gemini_judge_reward_func/std": 0.24021686613559723, "rewards/semantic_correctness_reward_func/mean": 0.4620618224143982, "rewards/semantic_correctness_reward_func/std": 0.2147509902715683, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 144.47769165039062, "completions/mean_terminated_length": 136.55406188964844, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.19700396910076395, "grad_norm": 0.02344970405101776, "kl": 0.02066946029663086, "learning_rate": 4.406155933475599e-06, "loss": -0.0091, "num_tokens": 205913557.0, "reward": 0.4101777970790863, "reward_std": 0.07215236127376556, "rewards/gemini_judge_reward_func/mean": 0.1417410671710968, "rewards/gemini_judge_reward_func/std": 0.24887388944625854, "rewards/semantic_correctness_reward_func/mean": 0.4446566700935364, "rewards/semantic_correctness_reward_func/std": 0.19075852632522583, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 166.50894165039062, "completions/mean_terminated_length": 146.93150329589844, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1973453971234689, "grad_norm": 0.02164464257657528, "kl": 0.022466421127319336, "learning_rate": 4.3897415459827e-06, "loss": -0.0295, "num_tokens": 206277067.0, "reward": 0.40516912937164307, "reward_std": 0.06767025589942932, "rewards/gemini_judge_reward_func/mean": 0.1037946417927742, "rewards/gemini_judge_reward_func/std": 0.22359952330589294, "rewards/semantic_correctness_reward_func/mean": 0.45975613594055176, "rewards/semantic_correctness_reward_func/std": 0.20172879099845886, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 165.875, "completions/mean_terminated_length": 146.28309631347656, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.19768682514617386, "grad_norm": 0.02861608937382698, "kl": 0.021225690841674805, "learning_rate": 4.373333832178478e-06, "loss": -0.035, "num_tokens": 206632675.0, "reward": 0.39243432879447937, "reward_std": 0.04971366375684738, "rewards/gemini_judge_reward_func/mean": 0.0904017835855484, "rewards/gemini_judge_reward_func/std": 0.20755748450756073, "rewards/semantic_correctness_reward_func/mean": 0.422868013381958, "rewards/semantic_correctness_reward_func/std": 0.21492131054401398, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 159.9107208251953, "completions/mean_terminated_length": 152.12612915039062, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.19802825316887884, "grad_norm": 0.023097701370716095, "kl": 0.014588117599487305, "learning_rate": 4.356932971495071e-06, "loss": -0.0088, "num_tokens": 207001439.0, "reward": 0.4645816385746002, "reward_std": 0.06817057728767395, "rewards/gemini_judge_reward_func/mean": 0.1339285671710968, "rewards/gemini_judge_reward_func/std": 0.24588216841220856, "rewards/semantic_correctness_reward_func/mean": 0.4820510447025299, "rewards/semantic_correctness_reward_func/std": 0.20867471396923065, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 160.1428680419922, "completions/mean_terminated_length": 148.41629028320312, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1983696811915838, "grad_norm": 0.02026437781751156, "kl": 0.019693374633789062, "learning_rate": 4.340539143289655e-06, "loss": -0.0289, "num_tokens": 207373963.0, "reward": 0.4335937798023224, "reward_std": 0.06907905638217926, "rewards/gemini_judge_reward_func/mean": 0.1428571492433548, "rewards/gemini_judge_reward_func/std": 0.24879594147205353, "rewards/semantic_correctness_reward_func/mean": 0.4522543251514435, "rewards/semantic_correctness_reward_func/std": 0.19972975552082062, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 157.0178680419922, "completions/mean_terminated_length": 149.20721435546875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.19871110921428875, "grad_norm": 0.020305583253502846, "kl": 0.014866828918457031, "learning_rate": 4.324152526842517e-06, "loss": -0.0239, "num_tokens": 207735511.0, "reward": 0.5174515247344971, "reward_std": 0.08982065320014954, "rewards/gemini_judge_reward_func/mean": 0.1941964328289032, "rewards/gemini_judge_reward_func/std": 0.2894052565097809, "rewards/semantic_correctness_reward_func/mean": 0.5186145901679993, "rewards/semantic_correctness_reward_func/std": 0.22194412350654602, "rewards/xmlcount_reward_func/mean": 0.8401250839233398, "rewards/xmlcount_reward_func/std": 0.3684578835964203, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 162.7678680419922, "completions/mean_terminated_length": 147.1090850830078, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.19905253723699373, "grad_norm": 0.020116182044148445, "kl": 0.017856121063232422, "learning_rate": 4.307773301355063e-06, "loss": -0.0238, "num_tokens": 208069891.0, "reward": 0.45708727836608887, "reward_std": 0.061787575483322144, "rewards/gemini_judge_reward_func/mean": 0.1328125, "rewards/gemini_judge_reward_func/std": 0.25487470626831055, "rewards/semantic_correctness_reward_func/mean": 0.4468111991882324, "rewards/semantic_correctness_reward_func/std": 0.21997879445552826, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 148.85714721679688, "completions/mean_terminated_length": 144.9327392578125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.19939396525969869, "grad_norm": 0.020018786191940308, "kl": 0.01651740074157715, "learning_rate": 4.291401645947879e-06, "loss": 0.0001, "num_tokens": 208416219.0, "reward": 0.43855661153793335, "reward_std": 0.05887799337506294, "rewards/gemini_judge_reward_func/mean": 0.125, "rewards/gemini_judge_reward_func/std": 0.23437733948230743, "rewards/semantic_correctness_reward_func/mean": 0.4412831664085388, "rewards/semantic_correctness_reward_func/std": 0.20886096358299255, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 158.47769165039062, "completions/mean_terminated_length": 142.74090576171875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.19973539328240367, "grad_norm": 0.01968037523329258, "kl": 0.013400793075561523, "learning_rate": 4.275037739658771e-06, "loss": -0.0413, "num_tokens": 208759250.0, "reward": 0.41233882308006287, "reward_std": 0.06944146752357483, "rewards/gemini_judge_reward_func/mean": 0.109375, "rewards/gemini_judge_reward_func/std": 0.22654861211776733, "rewards/semantic_correctness_reward_func/mean": 0.41294386982917786, "rewards/semantic_correctness_reward_func/std": 0.21011611819267273, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 163.94644165039062, "completions/mean_terminated_length": 148.30908203125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.20007682130510862, "grad_norm": 0.020078567788004875, "kl": 0.016144275665283203, "learning_rate": 4.25868176144079e-06, "loss": -0.0363, "num_tokens": 209127002.0, "reward": 0.3943597078323364, "reward_std": 0.06699073314666748, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.23073764145374298, "rewards/semantic_correctness_reward_func/mean": 0.4281020760536194, "rewards/semantic_correctness_reward_func/std": 0.186894953250885, "rewards/xmlcount_reward_func/mean": 0.643500030040741, "rewards/xmlcount_reward_func/std": 0.48071083426475525, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 158.75, "completions/mean_terminated_length": 147.00453186035156, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.20041824932781357, "grad_norm": 0.01927877590060234, "kl": 0.015355587005615234, "learning_rate": 4.242333890160299e-06, "loss": -0.0106, "num_tokens": 209469686.0, "reward": 0.43422627449035645, "reward_std": 0.05323384702205658, "rewards/gemini_judge_reward_func/mean": 0.0982142835855484, "rewards/gemini_judge_reward_func/std": 0.1844794601202011, "rewards/semantic_correctness_reward_func/mean": 0.4017024338245392, "rewards/semantic_correctness_reward_func/std": 0.2130288928747177, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 157.25, "completions/mean_terminated_length": 145.4841766357422, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.20075967735051856, "grad_norm": 0.022314567118883133, "kl": 0.02434396743774414, "learning_rate": 4.225994304594994e-06, "loss": 0.0151, "num_tokens": 209832814.0, "reward": 0.4395473003387451, "reward_std": 0.06388600915670395, "rewards/gemini_judge_reward_func/mean": 0.1540178507566452, "rewards/gemini_judge_reward_func/std": 0.29044100642204285, "rewards/semantic_correctness_reward_func/mean": 0.4597005844116211, "rewards/semantic_correctness_reward_func/std": 0.22340041399002075, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 154.54464721679688, "completions/mean_terminated_length": 138.73635864257812, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2011011053732235, "grad_norm": 0.022513817995786667, "kl": 0.018211841583251953, "learning_rate": 4.209663183431969e-06, "loss": -0.0298, "num_tokens": 210184528.0, "reward": 0.4027530252933502, "reward_std": 0.05283074453473091, "rewards/gemini_judge_reward_func/mean": 0.0959821417927742, "rewards/gemini_judge_reward_func/std": 0.20018360018730164, "rewards/semantic_correctness_reward_func/mean": 0.3873276114463806, "rewards/semantic_correctness_reward_func/std": 0.18129631876945496, "rewards/xmlcount_reward_func/mean": 0.7172366380691528, "rewards/xmlcount_reward_func/std": 0.45090451836586, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 161.55804443359375, "completions/mean_terminated_length": 157.6905975341797, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.20144253339592846, "grad_norm": 0.02004176191985607, "kl": 0.014394760131835938, "learning_rate": 4.193340705265746e-06, "loss": -0.0408, "num_tokens": 210528789.0, "reward": 0.45304736495018005, "reward_std": 0.059848908334970474, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.23075121641159058, "rewards/semantic_correctness_reward_func/mean": 0.43714746832847595, "rewards/semantic_correctness_reward_func/std": 0.2098701447248459, "rewards/xmlcount_reward_func/mean": 0.7868126034736633, "rewards/xmlcount_reward_func/std": 0.4040831923484802, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 150.5491180419922, "completions/mean_terminated_length": 138.6923065185547, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.20178396141863345, "grad_norm": 0.022640299052000046, "kl": 0.01974773406982422, "learning_rate": 4.17702704859633e-06, "loss": -0.0005, "num_tokens": 210894360.0, "reward": 0.4660325348377228, "reward_std": 0.05717170611023903, "rewards/gemini_judge_reward_func/mean": 0.1707589328289032, "rewards/gemini_judge_reward_func/std": 0.26177191734313965, "rewards/semantic_correctness_reward_func/mean": 0.456332266330719, "rewards/semantic_correctness_reward_func/std": 0.23281851410865784, "rewards/xmlcount_reward_func/mean": 0.7661563158035278, "rewards/xmlcount_reward_func/std": 0.4234122037887573, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 166.65179443359375, "completions/mean_terminated_length": 147.07762145996094, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.2021253894413384, "grad_norm": 0.022002065554261208, "kl": 0.017216205596923828, "learning_rate": 4.160722391827262e-06, "loss": -0.0401, "num_tokens": 211262166.0, "reward": 0.44295012950897217, "reward_std": 0.08097635209560394, "rewards/gemini_judge_reward_func/mean": 0.1629464328289032, "rewards/gemini_judge_reward_func/std": 0.28055402636528015, "rewards/semantic_correctness_reward_func/mean": 0.46779510378837585, "rewards/semantic_correctness_reward_func/std": 0.21671941876411438, "rewards/xmlcount_reward_func/mean": 0.7105312943458557, "rewards/xmlcount_reward_func/std": 0.4553159773349762, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 167.16964721679688, "completions/mean_terminated_length": 151.59091186523438, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.20246681746404335, "grad_norm": 0.020169679075479507, "kl": 0.0156557559967041, "learning_rate": 4.14442691326365e-06, "loss": -0.0042, "num_tokens": 211614188.0, "reward": 0.4513782262802124, "reward_std": 0.06877769529819489, "rewards/gemini_judge_reward_func/mean": 0.1763392835855484, "rewards/gemini_judge_reward_func/std": 0.3034524619579315, "rewards/semantic_correctness_reward_func/mean": 0.43846240639686584, "rewards/semantic_correctness_reward_func/std": 0.20841853320598602, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 164.9241180419922, "completions/mean_terminated_length": 137.21197509765625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.20280824548674833, "grad_norm": 0.01954479329288006, "kl": 0.018438100814819336, "learning_rate": 4.128140791110243e-06, "loss": -0.0154, "num_tokens": 211984707.0, "reward": 0.4166203439235687, "reward_std": 0.059841644018888474, "rewards/gemini_judge_reward_func/mean": 0.1316964328289032, "rewards/gemini_judge_reward_func/std": 0.23784302175045013, "rewards/semantic_correctness_reward_func/mean": 0.4254585802555084, "rewards/semantic_correctness_reward_func/std": 0.21392248570919037, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 160.84375, "completions/mean_terminated_length": 141.13697814941406, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2031496735094533, "grad_norm": 0.02386937290430069, "kl": 0.017685413360595703, "learning_rate": 4.111864203469457e-06, "loss": 0.0021, "num_tokens": 212359916.0, "reward": 0.4186449646949768, "reward_std": 0.07048141211271286, "rewards/gemini_judge_reward_func/mean": 0.15625, "rewards/gemini_judge_reward_func/std": 0.2691645324230194, "rewards/semantic_correctness_reward_func/mean": 0.4312067925930023, "rewards/semantic_correctness_reward_func/std": 0.18796810507774353, "rewards/xmlcount_reward_func/mean": 0.6747589111328125, "rewards/xmlcount_reward_func/std": 0.4678308665752411, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 163.3125, "completions/mean_terminated_length": 151.62896728515625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.20349110153215824, "grad_norm": 0.020488321781158447, "kl": 0.017343997955322266, "learning_rate": 4.0955973283394525e-06, "loss": -0.029, "num_tokens": 212738934.0, "reward": 0.43855446577072144, "reward_std": 0.0777134820818901, "rewards/gemini_judge_reward_func/mean": 0.1785714328289032, "rewards/gemini_judge_reward_func/std": 0.27494102716445923, "rewards/semantic_correctness_reward_func/mean": 0.4413793087005615, "rewards/semantic_correctness_reward_func/std": 0.2333817183971405, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 158.07589721679688, "completions/mean_terminated_length": 146.32127380371094, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.20383252955486322, "grad_norm": 0.019988469779491425, "kl": 0.014391899108886719, "learning_rate": 4.079340343612165e-06, "loss": -0.0016, "num_tokens": 213079815.0, "reward": 0.4672608971595764, "reward_std": 0.06315002590417862, "rewards/gemini_judge_reward_func/mean": 0.1261160671710968, "rewards/gemini_judge_reward_func/std": 0.2483101785182953, "rewards/semantic_correctness_reward_func/mean": 0.4395720660686493, "rewards/semantic_correctness_reward_func/std": 0.2068413943052292, "rewards/xmlcount_reward_func/mean": 0.8222500681877136, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 145.5491180419922, "completions/mean_terminated_length": 141.60987854003906, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.20417395757756818, "grad_norm": 0.021274050697684288, "kl": 0.01706838607788086, "learning_rate": 4.063093427071376e-06, "loss": -0.0041, "num_tokens": 213424574.0, "reward": 0.40272268652915955, "reward_std": 0.05240607634186745, "rewards/gemini_judge_reward_func/mean": 0.1216517835855484, "rewards/gemini_judge_reward_func/std": 0.23194913566112518, "rewards/semantic_correctness_reward_func/mean": 0.41180965304374695, "rewards/semantic_correctness_reward_func/std": 0.20743775367736816, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 164.43304443359375, "completions/mean_terminated_length": 148.80453491210938, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.20451538560027313, "grad_norm": 0.021274050697684288, "kl": 0.014577627182006836, "learning_rate": 4.063093427071376e-06, "loss": -0.0257, "num_tokens": 213774683.0, "reward": 0.458513081073761, "reward_std": 0.07192227244377136, "rewards/gemini_judge_reward_func/mean": 0.1037946417927742, "rewards/gemini_judge_reward_func/std": 0.2507770359516144, "rewards/semantic_correctness_reward_func/mean": 0.440475732088089, "rewards/semantic_correctness_reward_func/std": 0.21808552742004395, "rewards/xmlcount_reward_func/mean": 0.8222500681877136, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 142.44644165039062, "completions/mean_terminated_length": 134.50450134277344, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2048568136229781, "grad_norm": 0.02189001813530922, "kl": 0.018033504486083984, "learning_rate": 4.046856756390767e-06, "loss": -0.009, "num_tokens": 214127431.0, "reward": 0.4390813112258911, "reward_std": 0.08607413619756699, "rewards/gemini_judge_reward_func/mean": 0.1852678507566452, "rewards/gemini_judge_reward_func/std": 0.2980608642101288, "rewards/semantic_correctness_reward_func/mean": 0.4807279407978058, "rewards/semantic_correctness_reward_func/std": 0.22973637282848358, "rewards/xmlcount_reward_func/mean": 0.6720714569091797, "rewards/xmlcount_reward_func/std": 0.46528491377830505, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 139.8482208251953, "completions/mean_terminated_length": 135.88340759277344, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.20519824164568307, "grad_norm": 0.02204316481947899, "kl": 0.017858505249023438, "learning_rate": 4.03063050913196e-06, "loss": -0.0078, "num_tokens": 214486013.0, "reward": 0.38387420773506165, "reward_std": 0.047508224844932556, "rewards/gemini_judge_reward_func/mean": 0.1238839253783226, "rewards/gemini_judge_reward_func/std": 0.23675422370433807, "rewards/semantic_correctness_reward_func/mean": 0.43824586272239685, "rewards/semantic_correctness_reward_func/std": 0.20813381671905518, "rewards/xmlcount_reward_func/mean": 0.6166786551475525, "rewards/xmlcount_reward_func/std": 0.4832932651042938, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 168.6741180419922, "completions/mean_terminated_length": 145.13302612304688, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.20553966966838802, "grad_norm": 0.01992730423808098, "kl": 0.016730308532714844, "learning_rate": 4.0144148627426e-06, "loss": -0.0043, "num_tokens": 214836156.0, "reward": 0.44369426369667053, "reward_std": 0.05480070784687996, "rewards/gemini_judge_reward_func/mean": 0.1316964328289032, "rewards/gemini_judge_reward_func/std": 0.24709029495716095, "rewards/semantic_correctness_reward_func/mean": 0.42489978671073914, "rewards/semantic_correctness_reward_func/std": 0.19661790132522583, "rewards/xmlcount_reward_func/mean": 0.7650893330574036, "rewards/xmlcount_reward_func/std": 0.423270583152771, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 172.0982208251953, "completions/mean_terminated_length": 152.6483917236328, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.205881097691093, "grad_norm": 0.021027710288763046, "kl": 0.01509237289428711, "learning_rate": 3.998209994554395e-06, "loss": 0.0116, "num_tokens": 215171166.0, "reward": 0.45906171202659607, "reward_std": 0.0753428265452385, "rewards/gemini_judge_reward_func/mean": 0.15625, "rewards/gemini_judge_reward_func/std": 0.28138232231140137, "rewards/semantic_correctness_reward_func/mean": 0.4455583989620209, "rewards/semantic_correctness_reward_func/std": 0.22236627340316772, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 178.75894165039062, "completions/mean_terminated_length": 151.4930877685547, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.20622252571379796, "grad_norm": 0.020675111562013626, "kl": 0.017548084259033203, "learning_rate": 3.982016081781189e-06, "loss": -0.0197, "num_tokens": 215552816.0, "reward": 0.3967207670211792, "reward_std": 0.06549742072820663, "rewards/gemini_judge_reward_func/mean": 0.1049107164144516, "rewards/gemini_judge_reward_func/std": 0.22495238482952118, "rewards/semantic_correctness_reward_func/mean": 0.45103222131729126, "rewards/semantic_correctness_reward_func/std": 0.18998552858829498, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 154.8303680419922, "completions/mean_terminated_length": 147.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.20656395373650294, "grad_norm": 0.021269556134939194, "kl": 0.01663351058959961, "learning_rate": 3.965833301517017e-06, "loss": 0.0136, "num_tokens": 215937378.0, "reward": 0.4003709554672241, "reward_std": 0.07181476801633835, "rewards/gemini_judge_reward_func/mean": 0.1350446492433548, "rewards/gemini_judge_reward_func/std": 0.22932343184947968, "rewards/semantic_correctness_reward_func/mean": 0.43586355447769165, "rewards/semantic_correctness_reward_func/std": 0.20265518128871918, "rewards/xmlcount_reward_func/mean": 0.6479509472846985, "rewards/xmlcount_reward_func/std": 0.47591251134872437, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 166.24554443359375, "completions/mean_terminated_length": 150.64999389648438, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.2069053817592079, "grad_norm": 0.02032295987010002, "kl": 0.016942501068115234, "learning_rate": 3.949661830734172e-06, "loss": 0.0044, "num_tokens": 216288897.0, "reward": 0.4458765387535095, "reward_std": 0.07394890487194061, "rewards/gemini_judge_reward_func/mean": 0.1339285671710968, "rewards/gemini_judge_reward_func/std": 0.25919973850250244, "rewards/semantic_correctness_reward_func/mean": 0.4432217478752136, "rewards/semantic_correctness_reward_func/std": 0.20972265303134918, "rewards/xmlcount_reward_func/mean": 0.7591518759727478, "rewards/xmlcount_reward_func/std": 0.42889249324798584, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 148.90625, "completions/mean_terminated_length": 144.98207092285156, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.20724680978191284, "grad_norm": 0.021965689957141876, "kl": 0.017894744873046875, "learning_rate": 3.9335018462812664e-06, "loss": -0.0234, "num_tokens": 216636048.0, "reward": 0.4070368707180023, "reward_std": 0.06277727335691452, "rewards/gemini_judge_reward_func/mean": 0.1171875, "rewards/gemini_judge_reward_func/std": 0.21161696314811707, "rewards/semantic_correctness_reward_func/mean": 0.44230917096138, "rewards/semantic_correctness_reward_func/std": 0.20678167045116425, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 146.91964721679688, "completions/mean_terminated_length": 146.91964721679688, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.20758823780461783, "grad_norm": 0.02472434565424919, "kl": 0.03131532669067383, "learning_rate": 3.9173535248813026e-06, "loss": 0.0065, "num_tokens": 216973558.0, "reward": 0.4831780791282654, "reward_std": 0.0640798807144165, "rewards/gemini_judge_reward_func/mean": 0.1852678507566452, "rewards/gemini_judge_reward_func/std": 0.28655508160591125, "rewards/semantic_correctness_reward_func/mean": 0.4723544120788574, "rewards/semantic_correctness_reward_func/std": 0.23760126531124115, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 166.8169708251953, "completions/mean_terminated_length": 151.2318115234375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.20792966582732278, "grad_norm": 0.01872190646827221, "kl": 0.011590242385864258, "learning_rate": 3.901217043129735e-06, "loss": 0.008, "num_tokens": 217317493.0, "reward": 0.42884668707847595, "reward_std": 0.06799106299877167, "rewards/gemini_judge_reward_func/mean": 0.1104910746216774, "rewards/gemini_judge_reward_func/std": 0.23151718080043793, "rewards/semantic_correctness_reward_func/mean": 0.4217510223388672, "rewards/semantic_correctness_reward_func/std": 0.19558602571487427, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 159.5491180419922, "completions/mean_terminated_length": 143.83181762695312, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.20827109385002773, "grad_norm": 0.020979879423975945, "kl": 0.018959522247314453, "learning_rate": 3.885092577492543e-06, "loss": -0.009, "num_tokens": 217679120.0, "reward": 0.39373713731765747, "reward_std": 0.05377659946680069, "rewards/gemini_judge_reward_func/mean": 0.1015625, "rewards/gemini_judge_reward_func/std": 0.20642106235027313, "rewards/semantic_correctness_reward_func/mean": 0.4428104758262634, "rewards/semantic_correctness_reward_func/std": 0.2033531814813614, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 150.07144165039062, "completions/mean_terminated_length": 150.07144165039062, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.20861252187273271, "grad_norm": 0.021496569737792015, "kl": 0.01505589485168457, "learning_rate": 3.8689803043043e-06, "loss": 0.0073, "num_tokens": 218027112.0, "reward": 0.45133811235427856, "reward_std": 0.061567071825265884, "rewards/gemini_judge_reward_func/mean": 0.1015625, "rewards/gemini_judge_reward_func/std": 0.2170114368200302, "rewards/semantic_correctness_reward_func/mean": 0.409065306186676, "rewards/semantic_correctness_reward_func/std": 0.21651864051818848, "rewards/xmlcount_reward_func/mean": 0.8222500681877136, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 150.3616180419922, "completions/mean_terminated_length": 138.5022735595703, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.20895394989543767, "grad_norm": 0.021522024646401405, "kl": 0.01940298080444336, "learning_rate": 3.852880399766243e-06, "loss": -0.0166, "num_tokens": 218409965.0, "reward": 0.38020059466362, "reward_std": 0.05251891911029816, "rewards/gemini_judge_reward_func/mean": 0.1004464253783226, "rewards/gemini_judge_reward_func/std": 0.2330818623304367, "rewards/semantic_correctness_reward_func/mean": 0.4131101965904236, "rewards/semantic_correctness_reward_func/std": 0.1949501633644104, "rewards/xmlcount_reward_func/mean": 0.643500030040741, "rewards/xmlcount_reward_func/std": 0.48071083426475525, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 148.43304443359375, "completions/mean_terminated_length": 144.50672912597656, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.20929537791814262, "grad_norm": 0.021798672154545784, "kl": 0.01602315902709961, "learning_rate": 3.8367930399443495e-06, "loss": -0.0188, "num_tokens": 218773734.0, "reward": 0.39457279443740845, "reward_std": 0.059662409126758575, "rewards/gemini_judge_reward_func/mean": 0.1160714253783226, "rewards/gemini_judge_reward_func/std": 0.23776936531066895, "rewards/semantic_correctness_reward_func/mean": 0.4179709851741791, "rewards/semantic_correctness_reward_func/std": 0.2051243931055069, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 146.7053680419922, "completions/mean_terminated_length": 146.7053680419922, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2096368059408476, "grad_norm": 0.021316422149538994, "kl": 0.017024993896484375, "learning_rate": 3.820718400767409e-06, "loss": -0.0117, "num_tokens": 219123324.0, "reward": 0.41686007380485535, "reward_std": 0.0637628361582756, "rewards/gemini_judge_reward_func/mean": 0.1049107164144516, "rewards/gemini_judge_reward_func/std": 0.21863414347171783, "rewards/semantic_correctness_reward_func/mean": 0.462336003780365, "rewards/semantic_correctness_reward_func/std": 0.19964618980884552, "rewards/xmlcount_reward_func/mean": 0.7060714364051819, "rewards/xmlcount_reward_func/std": 0.4524170756340027, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 164.5669708251953, "completions/mean_terminated_length": 152.90045166015625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.20997823396355256, "grad_norm": 0.021478435024619102, "kl": 0.017641544342041016, "learning_rate": 3.8046566580251e-06, "loss": 0.0127, "num_tokens": 219468403.0, "reward": 0.44625985622406006, "reward_std": 0.043265651911497116, "rewards/gemini_judge_reward_func/mean": 0.1071428582072258, "rewards/gemini_judge_reward_func/std": 0.20149186253547668, "rewards/semantic_correctness_reward_func/mean": 0.4440133273601532, "rewards/semantic_correctness_reward_func/std": 0.21532121300697327, "rewards/xmlcount_reward_func/mean": 0.7864999771118164, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 157.15179443359375, "completions/mean_terminated_length": 145.38462829589844, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2103196619862575, "grad_norm": 0.02019723691046238, "kl": 0.014769792556762695, "learning_rate": 3.7886079873660693e-06, "loss": -0.0266, "num_tokens": 219809233.0, "reward": 0.446768581867218, "reward_std": 0.06777739524841309, "rewards/gemini_judge_reward_func/mean": 0.1183035746216774, "rewards/gemini_judge_reward_func/std": 0.2482219785451889, "rewards/semantic_correctness_reward_func/mean": 0.42423561215400696, "rewards/semantic_correctness_reward_func/std": 0.18544653058052063, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 162.58482360839844, "completions/mean_terminated_length": 146.9227294921875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2106610900089625, "grad_norm": 0.02150016464293003, "kl": 0.015299558639526367, "learning_rate": 3.7725725642960047e-06, "loss": 0.0054, "num_tokens": 220172916.0, "reward": 0.448917955160141, "reward_std": 0.049533385783433914, "rewards/gemini_judge_reward_func/mean": 0.0970982164144516, "rewards/gemini_judge_reward_func/std": 0.22285966575145721, "rewards/semantic_correctness_reward_func/mean": 0.41483062505722046, "rewards/semantic_correctness_reward_func/std": 0.2128439098596573, "rewards/xmlcount_reward_func/mean": 0.8177813291549683, "rewards/xmlcount_reward_func/std": 0.3879494369029999, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 142.2678680419922, "completions/mean_terminated_length": 142.2678680419922, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.21100251803166745, "grad_norm": 0.023679913952946663, "kl": 0.01729297637939453, "learning_rate": 3.756550564175727e-06, "loss": -0.0221, "num_tokens": 220527948.0, "reward": 0.4694575071334839, "reward_std": 0.08708461374044418, "rewards/gemini_judge_reward_func/mean": 0.1696428507566452, "rewards/gemini_judge_reward_func/std": 0.27956220507621765, "rewards/semantic_correctness_reward_func/mean": 0.4707517921924591, "rewards/semantic_correctness_reward_func/std": 0.2081519216299057, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 173.17857360839844, "completions/mean_terminated_length": 145.7327117919922, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2113439460543724, "grad_norm": 0.02082088217139244, "kl": 0.014158248901367188, "learning_rate": 3.7405421622192607e-06, "loss": -0.02, "num_tokens": 220909056.0, "reward": 0.4316154420375824, "reward_std": 0.06896168738603592, "rewards/gemini_judge_reward_func/mean": 0.1350446492433548, "rewards/gemini_judge_reward_func/std": 0.24239464104175568, "rewards/semantic_correctness_reward_func/mean": 0.4688715636730194, "rewards/semantic_correctness_reward_func/std": 0.17802277207374573, "rewards/xmlcount_reward_func/mean": 0.7095580697059631, "rewards/xmlcount_reward_func/std": 0.45223551988601685, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 152.86607360839844, "completions/mean_terminated_length": 145.0180206298828, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.21168537407707738, "grad_norm": 0.020124070346355438, "kl": 0.014010190963745117, "learning_rate": 3.7245475334919246e-06, "loss": -0.0191, "num_tokens": 221245478.0, "reward": 0.41337040066719055, "reward_std": 0.0564776174724102, "rewards/gemini_judge_reward_func/mean": 0.1116071417927742, "rewards/gemini_judge_reward_func/std": 0.191873237490654, "rewards/semantic_correctness_reward_func/mean": 0.4136374294757843, "rewards/semantic_correctness_reward_func/std": 0.1768476963043213, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 159.0178680419922, "completions/mean_terminated_length": 151.22523498535156, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.21202680209978234, "grad_norm": 0.0207071453332901, "kl": 0.014678478240966797, "learning_rate": 3.7085668529084183e-06, "loss": -0.0211, "num_tokens": 221594150.0, "reward": 0.4323264956474304, "reward_std": 0.06878480315208435, "rewards/gemini_judge_reward_func/mean": 0.1015625, "rewards/gemini_judge_reward_func/std": 0.2170114368200302, "rewards/semantic_correctness_reward_func/mean": 0.4190162122249603, "rewards/semantic_correctness_reward_func/std": 0.20656156539916992, "rewards/xmlcount_reward_func/mean": 0.7697455286979675, "rewards/xmlcount_reward_func/std": 0.4218544363975525, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 166.12054443359375, "completions/mean_terminated_length": 146.53424072265625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.21236823012248732, "grad_norm": 0.01952914334833622, "kl": 0.014682769775390625, "learning_rate": 3.6926002952309015e-06, "loss": -0.0159, "num_tokens": 221943565.0, "reward": 0.4331689774990082, "reward_std": 0.06450119614601135, "rewards/gemini_judge_reward_func/mean": 0.1651785671710968, "rewards/gemini_judge_reward_func/std": 0.24730288982391357, "rewards/semantic_correctness_reward_func/mean": 0.4769876301288605, "rewards/semantic_correctness_reward_func/std": 0.22601205110549927, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 150.64732360839844, "completions/mean_terminated_length": 142.77928161621094, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.21270965814519227, "grad_norm": 0.02059789188206196, "kl": 0.014481544494628906, "learning_rate": 3.676648035067093e-06, "loss": -0.0166, "num_tokens": 222291158.0, "reward": 0.42687147855758667, "reward_std": 0.06625192612409592, "rewards/gemini_judge_reward_func/mean": 0.1227678582072258, "rewards/gemini_judge_reward_func/std": 0.23555946350097656, "rewards/semantic_correctness_reward_func/mean": 0.4230715334415436, "rewards/semantic_correctness_reward_func/std": 0.18972373008728027, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 153.20982360839844, "completions/mean_terminated_length": 149.30494689941406, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.21305108616789722, "grad_norm": 0.020177403464913368, "kl": 0.01506662368774414, "learning_rate": 3.6607102468683524e-06, "loss": 0.002, "num_tokens": 222639001.0, "reward": 0.4309717118740082, "reward_std": 0.06068568304181099, "rewards/gemini_judge_reward_func/mean": 0.1294642835855484, "rewards/gemini_judge_reward_func/std": 0.24140411615371704, "rewards/semantic_correctness_reward_func/mean": 0.4302067160606384, "rewards/semantic_correctness_reward_func/std": 0.21803805232048035, "rewards/xmlcount_reward_func/mean": 0.7328616380691528, "rewards/xmlcount_reward_func/std": 0.4410996735095978, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 154.47769165039062, "completions/mean_terminated_length": 154.47769165039062, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2133925141906022, "grad_norm": 0.020750742405653, "kl": 0.018827438354492188, "learning_rate": 3.64478710492778e-06, "loss": -0.0, "num_tokens": 222961280.0, "reward": 0.4677436947822571, "reward_std": 0.056711845099925995, "rewards/gemini_judge_reward_func/mean": 0.1595982164144516, "rewards/gemini_judge_reward_func/std": 0.2677079737186432, "rewards/semantic_correctness_reward_func/mean": 0.4465217590332031, "rewards/semantic_correctness_reward_func/std": 0.2243902087211609, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 155.09375, "completions/mean_terminated_length": 143.29864501953125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.21373394221330716, "grad_norm": 0.026382334530353546, "kl": 0.022363662719726562, "learning_rate": 3.628878783378302e-06, "loss": -0.013, "num_tokens": 223344509.0, "reward": 0.40488913655281067, "reward_std": 0.06225220486521721, "rewards/gemini_judge_reward_func/mean": 0.1216517835855484, "rewards/gemini_judge_reward_func/std": 0.23435331881046295, "rewards/semantic_correctness_reward_func/mean": 0.42264196276664734, "rewards/semantic_correctness_reward_func/std": 0.20638230443000793, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 161.2544708251953, "completions/mean_terminated_length": 145.56817626953125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2140753702360121, "grad_norm": 0.020892329514026642, "kl": 0.016319751739501953, "learning_rate": 3.6129854561907786e-06, "loss": -0.0176, "num_tokens": 223712174.0, "reward": 0.4206882119178772, "reward_std": 0.04852227121591568, "rewards/gemini_judge_reward_func/mean": 0.0881696417927742, "rewards/gemini_judge_reward_func/std": 0.17807073891162872, "rewards/semantic_correctness_reward_func/mean": 0.4256015419960022, "rewards/semantic_correctness_reward_func/std": 0.20432168245315552, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 155.27679443359375, "completions/mean_terminated_length": 147.45045471191406, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2144167982587171, "grad_norm": 0.023052629083395004, "kl": 0.01653289794921875, "learning_rate": 3.5971072971720844e-06, "loss": 0.0098, "num_tokens": 224050572.0, "reward": 0.4531470239162445, "reward_std": 0.05186094716191292, "rewards/gemini_judge_reward_func/mean": 0.1439732164144516, "rewards/gemini_judge_reward_func/std": 0.2692737579345703, "rewards/semantic_correctness_reward_func/mean": 0.44053834676742554, "rewards/semantic_correctness_reward_func/std": 0.2101813703775406, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 170.14732360839844, "completions/mean_terminated_length": 146.64678955078125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.21475822628142205, "grad_norm": 0.02114696241915226, "kl": 0.019293546676635742, "learning_rate": 3.581244479963225e-06, "loss": -0.0485, "num_tokens": 224408285.0, "reward": 0.42841026186943054, "reward_std": 0.07736083120107651, "rewards/gemini_judge_reward_func/mean": 0.125, "rewards/gemini_judge_reward_func/std": 0.22708921134471893, "rewards/semantic_correctness_reward_func/mean": 0.43763142824172974, "rewards/semantic_correctness_reward_func/std": 0.21282999217510223, "rewards/xmlcount_reward_func/mean": 0.7272098660469055, "rewards/xmlcount_reward_func/std": 0.4462122321128845, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 147.69644165039062, "completions/mean_terminated_length": 139.8018035888672, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.215099654304127, "grad_norm": 0.02138863503932953, "kl": 0.016696453094482422, "learning_rate": 3.56539717803743e-06, "loss": -0.0161, "num_tokens": 224749813.0, "reward": 0.44623899459838867, "reward_std": 0.06276748329401016, "rewards/gemini_judge_reward_func/mean": 0.1049107164144516, "rewards/gemini_judge_reward_func/std": 0.22244662046432495, "rewards/semantic_correctness_reward_func/mean": 0.4841232895851135, "rewards/semantic_correctness_reward_func/std": 0.20933982729911804, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 146.79019165039062, "completions/mean_terminated_length": 142.85650634765625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.21544108232683198, "grad_norm": 0.0239239614456892, "kl": 0.018682479858398438, "learning_rate": 3.5495655646982506e-06, "loss": -0.0027, "num_tokens": 225095170.0, "reward": 0.44244566559791565, "reward_std": 0.0652119442820549, "rewards/gemini_judge_reward_func/mean": 0.1573660671710968, "rewards/gemini_judge_reward_func/std": 0.27110758423805237, "rewards/semantic_correctness_reward_func/mean": 0.46749600768089294, "rewards/semantic_correctness_reward_func/std": 0.21619705855846405, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 144.5357208251953, "completions/mean_terminated_length": 144.5357208251953, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.21578251034953694, "grad_norm": 0.0239239614456892, "kl": 0.016391754150390625, "learning_rate": 3.5495655646982506e-06, "loss": 0.0224, "num_tokens": 225443274.0, "reward": 0.4401990473270416, "reward_std": 0.06552834808826447, "rewards/gemini_judge_reward_func/mean": 0.1127232164144516, "rewards/gemini_judge_reward_func/std": 0.21921423077583313, "rewards/semantic_correctness_reward_func/mean": 0.41371825337409973, "rewards/semantic_correctness_reward_func/std": 0.21398547291755676, "rewards/xmlcount_reward_func/mean": 0.7809152603149414, "rewards/xmlcount_reward_func/std": 0.4144832491874695, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 157.38839721679688, "completions/mean_terminated_length": 149.58108520507812, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.2161239383722419, "grad_norm": 0.019696949049830437, "kl": 0.017939090728759766, "learning_rate": 3.533749813077677e-06, "loss": -0.05, "num_tokens": 225808005.0, "reward": 0.46345487236976624, "reward_std": 0.08401855826377869, "rewards/gemini_judge_reward_func/mean": 0.1540178507566452, "rewards/gemini_judge_reward_func/std": 0.27251842617988586, "rewards/semantic_correctness_reward_func/mean": 0.4541133940219879, "rewards/semantic_correctness_reward_func/std": 0.20422282814979553, "rewards/xmlcount_reward_func/mean": 0.7775625586509705, "rewards/xmlcount_reward_func/std": 0.4177508056163788, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 175.32144165039062, "completions/mean_terminated_length": 155.94520568847656, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.21646536639494687, "grad_norm": 0.020858891308307648, "kl": 0.016012191772460938, "learning_rate": 3.517950096134232e-06, "loss": -0.0292, "num_tokens": 226140553.0, "reward": 0.4431969225406647, "reward_std": 0.05470741167664528, "rewards/gemini_judge_reward_func/mean": 0.1104910746216774, "rewards/gemini_judge_reward_func/std": 0.21389874815940857, "rewards/semantic_correctness_reward_func/mean": 0.4220024645328522, "rewards/semantic_correctness_reward_func/std": 0.20806531608104706, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 158.625, "completions/mean_terminated_length": 154.74440002441406, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.21680679441765183, "grad_norm": 0.020396729931235313, "kl": 0.012579917907714844, "learning_rate": 3.5021665866510924e-06, "loss": -0.013, "num_tokens": 226466773.0, "reward": 0.45861443877220154, "reward_std": 0.06587579846382141, "rewards/gemini_judge_reward_func/mean": 0.1194196417927742, "rewards/gemini_judge_reward_func/std": 0.22202719748020172, "rewards/semantic_correctness_reward_func/mean": 0.4454827308654785, "rewards/semantic_correctness_reward_func/std": 0.2230159491300583, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 159.45982360839844, "completions/mean_terminated_length": 147.72398376464844, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.21714822244035678, "grad_norm": 0.02042062021791935, "kl": 0.015841245651245117, "learning_rate": 3.4863994572341845e-06, "loss": -0.0106, "num_tokens": 226816248.0, "reward": 0.42706283926963806, "reward_std": 0.06164560839533806, "rewards/gemini_judge_reward_func/mean": 0.1116071417927742, "rewards/gemini_judge_reward_func/std": 0.23037132620811462, "rewards/semantic_correctness_reward_func/mean": 0.41059979796409607, "rewards/semantic_correctness_reward_func/std": 0.1852397918701172, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 157.96875, "completions/mean_terminated_length": 150.1666717529297, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.21748965046306176, "grad_norm": 0.02099723368883133, "kl": 0.012554168701171875, "learning_rate": 3.470648880310313e-06, "loss": -0.0144, "num_tokens": 227145813.0, "reward": 0.5042933821678162, "reward_std": 0.06609771400690079, "rewards/gemini_judge_reward_func/mean": 0.1707589328289032, "rewards/gemini_judge_reward_func/std": 0.2563627362251282, "rewards/semantic_correctness_reward_func/mean": 0.4639488756656647, "rewards/semantic_correctness_reward_func/std": 0.22148670256137848, "rewards/xmlcount_reward_func/mean": 0.8580000996589661, "rewards/xmlcount_reward_func/std": 0.35106155276298523, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 160.8794708251953, "completions/mean_terminated_length": 149.16290283203125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.21783107848576672, "grad_norm": 0.020512668415904045, "kl": 0.01687908172607422, "learning_rate": 3.4549150281252635e-06, "loss": 0.0107, "num_tokens": 227496678.0, "reward": 0.435072660446167, "reward_std": 0.05142156034708023, "rewards/gemini_judge_reward_func/mean": 0.1049107164144516, "rewards/gemini_judge_reward_func/std": 0.20812636613845825, "rewards/semantic_correctness_reward_func/mean": 0.4282917380332947, "rewards/semantic_correctness_reward_func/std": 0.22628170251846313, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 148.72769165039062, "completions/mean_terminated_length": 148.72769165039062, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.21817250650847167, "grad_norm": 0.021282676607370377, "kl": 0.013463735580444336, "learning_rate": 3.4391980727419206e-06, "loss": 0.0127, "num_tokens": 227858717.0, "reward": 0.4656349718570709, "reward_std": 0.08616151660680771, "rewards/gemini_judge_reward_func/mean": 0.1428571492433548, "rewards/gemini_judge_reward_func/std": 0.26726123690605164, "rewards/semantic_correctness_reward_func/mean": 0.4337104260921478, "rewards/semantic_correctness_reward_func/std": 0.2192811518907547, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 173.65626525878906, "completions/mean_terminated_length": 150.25228881835938, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.21851393453117665, "grad_norm": 0.018349410966038704, "kl": 0.014457941055297852, "learning_rate": 3.423498186038393e-06, "loss": -0.0335, "num_tokens": 228224992.0, "reward": 0.4189697206020355, "reward_std": 0.06830798089504242, "rewards/gemini_judge_reward_func/mean": 0.1328125, "rewards/gemini_judge_reward_func/std": 0.26774534583091736, "rewards/semantic_correctness_reward_func/mean": 0.4349732995033264, "rewards/semantic_correctness_reward_func/std": 0.21521888673305511, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 159.75, "completions/mean_terminated_length": 151.96397399902344, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.2188553625538816, "grad_norm": 0.02072848007082939, "kl": 0.014017105102539062, "learning_rate": 3.4078155397061243e-06, "loss": 0.0124, "num_tokens": 228581368.0, "reward": 0.43587082624435425, "reward_std": 0.06837836652994156, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.23316773772239685, "rewards/semantic_correctness_reward_func/mean": 0.4233896732330322, "rewards/semantic_correctness_reward_func/std": 0.21983756124973297, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 175.80357360839844, "completions/mean_terminated_length": 156.4383544921875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2191967905765866, "grad_norm": 0.0204218327999115, "kl": 0.015043258666992188, "learning_rate": 3.3921503052480243e-06, "loss": -0.0068, "num_tokens": 228939304.0, "reward": 0.41249316930770874, "reward_std": 0.04543416202068329, "rewards/gemini_judge_reward_func/mean": 0.09375, "rewards/gemini_judge_reward_func/std": 0.20537890493869781, "rewards/semantic_correctness_reward_func/mean": 0.41145679354667664, "rewards/semantic_correctness_reward_func/std": 0.19346196949481964, "rewards/xmlcount_reward_func/mean": 0.7317544221878052, "rewards/xmlcount_reward_func/std": 0.4439154863357544, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 157.9375, "completions/mean_terminated_length": 146.18099975585938, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.21953821859929154, "grad_norm": 0.020854616537690163, "kl": 0.014810562133789062, "learning_rate": 3.3765026539765832e-06, "loss": -0.028, "num_tokens": 229304102.0, "reward": 0.4532719552516937, "reward_std": 0.07113207876682281, "rewards/gemini_judge_reward_func/mean": 0.1428571492433548, "rewards/gemini_judge_reward_func/std": 0.26935040950775146, "rewards/semantic_correctness_reward_func/mean": 0.4612525403499603, "rewards/semantic_correctness_reward_func/std": 0.216531440615654, "rewards/xmlcount_reward_func/mean": 0.7596964240074158, "rewards/xmlcount_reward_func/std": 0.423846960067749, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 174.79019165039062, "completions/mean_terminated_length": 147.3963165283203, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.2198796466219965, "grad_norm": 0.022890524938702583, "kl": 0.015017271041870117, "learning_rate": 3.3608727570120114e-06, "loss": 0.0367, "num_tokens": 229678923.0, "reward": 0.40588462352752686, "reward_std": 0.05438840389251709, "rewards/gemini_judge_reward_func/mean": 0.1305803507566452, "rewards/gemini_judge_reward_func/std": 0.2719554007053375, "rewards/semantic_correctness_reward_func/mean": 0.42761939764022827, "rewards/semantic_correctness_reward_func/std": 0.22883787751197815, "rewards/xmlcount_reward_func/mean": 0.6703214645385742, "rewards/xmlcount_reward_func/std": 0.4670778214931488, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 143.00894165039062, "completions/mean_terminated_length": 143.00894165039062, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.22022107464470148, "grad_norm": 0.020465996116399765, "kl": 0.012059688568115234, "learning_rate": 3.3452607852803585e-06, "loss": 0.0032, "num_tokens": 230010125.0, "reward": 0.48016557097435, "reward_std": 0.07235633581876755, "rewards/gemini_judge_reward_func/mean": 0.1495535671710968, "rewards/gemini_judge_reward_func/std": 0.2515864968299866, "rewards/semantic_correctness_reward_func/mean": 0.45722055435180664, "rewards/semantic_correctness_reward_func/std": 0.24177604913711548, "rewards/xmlcount_reward_func/mean": 0.8222500681877136, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 164.90179443359375, "completions/mean_terminated_length": 145.28765869140625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.22056250266740643, "grad_norm": 0.021560531109571457, "kl": 0.017102718353271484, "learning_rate": 3.3296669095116454e-06, "loss": -0.0028, "num_tokens": 230365327.0, "reward": 0.44443637132644653, "reward_std": 0.09491990506649017, "rewards/gemini_judge_reward_func/mean": 0.1886160671710968, "rewards/gemini_judge_reward_func/std": 0.3165358304977417, "rewards/semantic_correctness_reward_func/mean": 0.4797532856464386, "rewards/semantic_correctness_reward_func/std": 0.20572076737880707, "rewards/xmlcount_reward_func/mean": 0.6825982332229614, "rewards/xmlcount_reward_func/std": 0.46513426303863525, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 159.3303680419922, "completions/mean_terminated_length": 143.6090850830078, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.22090393069011138, "grad_norm": 0.022877030074596405, "kl": 0.021349430084228516, "learning_rate": 3.3140913002379993e-06, "loss": -0.0177, "num_tokens": 230736949.0, "reward": 0.4325730800628662, "reward_std": 0.07304774224758148, "rewards/gemini_judge_reward_func/mean": 0.1674107164144516, "rewards/gemini_judge_reward_func/std": 0.29071658849716187, "rewards/semantic_correctness_reward_func/mean": 0.4695436656475067, "rewards/semantic_correctness_reward_func/std": 0.2264811396598816, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853896975517273, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 179.88394165039062, "completions/mean_terminated_length": 160.6118621826172, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.22124535871281636, "grad_norm": 0.019290877506136894, "kl": 0.013090372085571289, "learning_rate": 3.298534127791785e-06, "loss": -0.0044, "num_tokens": 231096655.0, "reward": 0.4430634081363678, "reward_std": 0.06918629258871078, "rewards/gemini_judge_reward_func/mean": 0.1417410671710968, "rewards/gemini_judge_reward_func/std": 0.2735517919063568, "rewards/semantic_correctness_reward_func/mean": 0.4303347170352936, "rewards/semantic_correctness_reward_func/std": 0.23128339648246765, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 150.9241180419922, "completions/mean_terminated_length": 143.05856323242188, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.22158678673552132, "grad_norm": 0.022577356547117233, "kl": 0.017051219940185547, "learning_rate": 3.2829955623037536e-06, "loss": -0.0341, "num_tokens": 231438194.0, "reward": 0.4817308783531189, "reward_std": 0.07706872373819351, "rewards/gemini_judge_reward_func/mean": 0.2109375, "rewards/gemini_judge_reward_func/std": 0.3183174729347229, "rewards/semantic_correctness_reward_func/mean": 0.4852793514728546, "rewards/semantic_correctness_reward_func/std": 0.22220410406589508, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 176.82144165039062, "completions/mean_terminated_length": 145.44444274902344, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.22192821475822627, "grad_norm": 0.02091185748577118, "kl": 0.01692962646484375, "learning_rate": 3.267475773701161e-06, "loss": -0.0186, "num_tokens": 231813410.0, "reward": 0.41409575939178467, "reward_std": 0.0569186732172966, "rewards/gemini_judge_reward_func/mean": 0.140625, "rewards/gemini_judge_reward_func/std": 0.24554608762264252, "rewards/semantic_correctness_reward_func/mean": 0.43966618180274963, "rewards/semantic_correctness_reward_func/std": 0.20007139444351196, "rewards/xmlcount_reward_func/mean": 0.6747812628746033, "rewards/xmlcount_reward_func/std": 0.4702270030975342, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 158.32589721679688, "completions/mean_terminated_length": 150.5270233154297, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.22226964278093125, "grad_norm": 0.021968696266412735, "kl": 0.017704010009765625, "learning_rate": 3.251974931705933e-06, "loss": -0.0113, "num_tokens": 232185319.0, "reward": 0.39724671840667725, "reward_std": 0.0701524019241333, "rewards/gemini_judge_reward_func/mean": 0.1383928507566452, "rewards/gemini_judge_reward_func/std": 0.2468166947364807, "rewards/semantic_correctness_reward_func/mean": 0.44034072756767273, "rewards/semantic_correctness_reward_func/std": 0.22161860764026642, "rewards/xmlcount_reward_func/mean": 0.6345536112785339, "rewards/xmlcount_reward_func/std": 0.4786224365234375, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 154.36607360839844, "completions/mean_terminated_length": 142.56109619140625, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.2226110708036362, "grad_norm": 0.02309529297053814, "kl": 0.01496124267578125, "learning_rate": 3.236493205832795e-06, "loss": -0.016, "num_tokens": 232526501.0, "reward": 0.43352967500686646, "reward_std": 0.060899149626493454, "rewards/gemini_judge_reward_func/mean": 0.109375, "rewards/gemini_judge_reward_func/std": 0.22530809044837952, "rewards/semantic_correctness_reward_func/mean": 0.42160341143608093, "rewards/semantic_correctness_reward_func/std": 0.24161309003829956, "rewards/xmlcount_reward_func/mean": 0.7636473774909973, "rewards/xmlcount_reward_func/std": 0.42616090178489685, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 165.7991180419922, "completions/mean_terminated_length": 150.19544982910156, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.22295249882634116, "grad_norm": 0.020331187173724174, "kl": 0.018056392669677734, "learning_rate": 3.2210307653874175e-06, "loss": -0.0365, "num_tokens": 232879984.0, "reward": 0.4239354431629181, "reward_std": 0.06748870015144348, "rewards/gemini_judge_reward_func/mean": 0.1160714253783226, "rewards/gemini_judge_reward_func/std": 0.24011527001857758, "rewards/semantic_correctness_reward_func/mean": 0.42178425192832947, "rewards/semantic_correctness_reward_func/std": 0.19532445073127747, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 931.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 151.60714721679688, "completions/mean_terminated_length": 151.60714721679688, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.22329392684904614, "grad_norm": 0.019777696579694748, "kl": 0.015816688537597656, "learning_rate": 3.205587779464576e-06, "loss": -0.0058, "num_tokens": 233223568.0, "reward": 0.4531329274177551, "reward_std": 0.06646832823753357, "rewards/gemini_judge_reward_func/mean": 0.1841517835855484, "rewards/gemini_judge_reward_func/std": 0.2982853949069977, "rewards/semantic_correctness_reward_func/mean": 0.4673609137535095, "rewards/semantic_correctness_reward_func/std": 0.23493170738220215, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 155.05357360839844, "completions/mean_terminated_length": 143.25791931152344, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2236353548717511, "grad_norm": 0.02039613015949726, "kl": 0.014897584915161133, "learning_rate": 3.1901644169462854e-06, "loss": -0.0026, "num_tokens": 233578128.0, "reward": 0.46499761939048767, "reward_std": 0.0666971355676651, "rewards/gemini_judge_reward_func/mean": 0.1674107164144516, "rewards/gemini_judge_reward_func/std": 0.27689096331596375, "rewards/semantic_correctness_reward_func/mean": 0.4529164731502533, "rewards/semantic_correctness_reward_func/std": 0.22561688721179962, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 156.30357360839844, "completions/mean_terminated_length": 144.52488708496094, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.22397678289445605, "grad_norm": 0.020914804190397263, "kl": 0.016788959503173828, "learning_rate": 3.1747608464999723e-06, "loss": 0.0177, "num_tokens": 233943512.0, "reward": 0.4334987998008728, "reward_std": 0.05882357805967331, "rewards/gemini_judge_reward_func/mean": 0.1316964328289032, "rewards/gemini_judge_reward_func/std": 0.2698609232902527, "rewards/semantic_correctness_reward_func/mean": 0.43835094571113586, "rewards/semantic_correctness_reward_func/std": 0.20942819118499756, "rewards/xmlcount_reward_func/mean": 0.7328750491142273, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 153.4732208251953, "completions/mean_terminated_length": 145.63063049316406, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.22431821091716103, "grad_norm": 0.01971406303346157, "kl": 0.014064550399780273, "learning_rate": 3.1593772365766107e-06, "loss": -0.0151, "num_tokens": 234292622.0, "reward": 0.45534926652908325, "reward_std": 0.06678459793329239, "rewards/gemini_judge_reward_func/mean": 0.1651785671710968, "rewards/gemini_judge_reward_func/std": 0.30513831973075867, "rewards/semantic_correctness_reward_func/mean": 0.44488900899887085, "rewards/semantic_correctness_reward_func/std": 0.22186164557933807, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 152.52232360839844, "completions/mean_terminated_length": 148.61436462402344, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.22465963893986599, "grad_norm": 0.021667256951332092, "kl": 0.014690876007080078, "learning_rate": 3.1440137554088957e-06, "loss": 0.0193, "num_tokens": 234646675.0, "reward": 0.42377644777297974, "reward_std": 0.05661001801490784, "rewards/gemini_judge_reward_func/mean": 0.1116071417927742, "rewards/gemini_judge_reward_func/std": 0.23158472776412964, "rewards/semantic_correctness_reward_func/mean": 0.3941677212715149, "rewards/semantic_correctness_reward_func/std": 0.18619193136692047, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 166.2053680419922, "completions/mean_terminated_length": 154.56109619140625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.22500106696257094, "grad_norm": 0.021019073203206062, "kl": 0.016040325164794922, "learning_rate": 3.128670571009399e-06, "loss": -0.0183, "num_tokens": 235011845.0, "reward": 0.45211324095726013, "reward_std": 0.07263405621051788, "rewards/gemini_judge_reward_func/mean": 0.1439732164144516, "rewards/gemini_judge_reward_func/std": 0.23717662692070007, "rewards/semantic_correctness_reward_func/mean": 0.4532356858253479, "rewards/semantic_correctness_reward_func/std": 0.22988630831241608, "rewards/xmlcount_reward_func/mean": 0.7596920728683472, "rewards/xmlcount_reward_func/std": 0.42566749453544617, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 153.04464721679688, "completions/mean_terminated_length": 145.1981964111328, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.22534249498527592, "grad_norm": 0.02109494060277939, "kl": 0.014479875564575195, "learning_rate": 3.1133478511687217e-06, "loss": -0.0027, "num_tokens": 235377351.0, "reward": 0.43420976400375366, "reward_std": 0.0640973299741745, "rewards/gemini_judge_reward_func/mean": 0.1350446492433548, "rewards/gemini_judge_reward_func/std": 0.2365427315235138, "rewards/semantic_correctness_reward_func/mean": 0.43520933389663696, "rewards/semantic_correctness_reward_func/std": 0.20545201003551483, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 151.85714721679688, "completions/mean_terminated_length": 136.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.22568392300798087, "grad_norm": 0.02080109529197216, "kl": 0.018342018127441406, "learning_rate": 3.0980457634536775e-06, "loss": 0.003, "num_tokens": 235770759.0, "reward": 0.38216474652290344, "reward_std": 0.05819929018616676, "rewards/gemini_judge_reward_func/mean": 0.1082589253783226, "rewards/gemini_judge_reward_func/std": 0.2264685183763504, "rewards/semantic_correctness_reward_func/mean": 0.3983592689037323, "rewards/semantic_correctness_reward_func/std": 0.18890590965747833, "rewards/xmlcount_reward_func/mean": 0.6479732394218445, "rewards/xmlcount_reward_func/std": 0.47640955448150635, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 165.0803680419922, "completions/mean_terminated_length": 161.2287139892578, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.22602535103068586, "grad_norm": 0.019101275131106377, "kl": 0.011385202407836914, "learning_rate": 3.082764475205442e-06, "loss": -0.0045, "num_tokens": 236104989.0, "reward": 0.48020070791244507, "reward_std": 0.07216404378414154, "rewards/gemini_judge_reward_func/mean": 0.1551339328289032, "rewards/gemini_judge_reward_func/std": 0.2785017192363739, "rewards/semantic_correctness_reward_func/mean": 0.4462354779243469, "rewards/semantic_correctness_reward_func/std": 0.223235622048378, "rewards/xmlcount_reward_func/mean": 0.8222500085830688, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 166.8616180419922, "completions/mean_terminated_length": 151.2772674560547, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2263667790533908, "grad_norm": 0.020711250603199005, "kl": 0.016867637634277344, "learning_rate": 3.06750415353774e-06, "loss": 0.0041, "num_tokens": 236459250.0, "reward": 0.45459437370300293, "reward_std": 0.07294096797704697, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.23554618656635284, "rewards/semantic_correctness_reward_func/mean": 0.4790251851081848, "rewards/semantic_correctness_reward_func/std": 0.18285901844501495, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 150.6875, "completions/mean_terminated_length": 146.77130126953125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.22670820707609576, "grad_norm": 0.021186042577028275, "kl": 0.016530513763427734, "learning_rate": 3.052264965335e-06, "loss": 0.0128, "num_tokens": 236802736.0, "reward": 0.4258687198162079, "reward_std": 0.05062773451209068, "rewards/gemini_judge_reward_func/mean": 0.1071428582072258, "rewards/gemini_judge_reward_func/std": 0.2110041379928589, "rewards/semantic_correctness_reward_func/mean": 0.3867451846599579, "rewards/semantic_correctness_reward_func/std": 0.18891946971416473, "rewards/xmlcount_reward_func/mean": 0.7641562819480896, "rewards/xmlcount_reward_func/std": 0.4263768792152405, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 157.95982360839844, "completions/mean_terminated_length": 150.15765380859375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.22704963509880075, "grad_norm": 0.01883574016392231, "kl": 0.01505136489868164, "learning_rate": 3.0370470772505433e-06, "loss": 0.0167, "num_tokens": 237155743.0, "reward": 0.4579858183860779, "reward_std": 0.07837632298469543, "rewards/gemini_judge_reward_func/mean": 0.1796875, "rewards/gemini_judge_reward_func/std": 0.28767722845077515, "rewards/semantic_correctness_reward_func/mean": 0.42905402183532715, "rewards/semantic_correctness_reward_func/std": 0.21172691881656647, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 149.35269165039062, "completions/mean_terminated_length": 141.4729766845703, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.2273910631215057, "grad_norm": 0.02011125721037388, "kl": 0.01688098907470703, "learning_rate": 3.02185065570476e-06, "loss": 0.0042, "num_tokens": 237510790.0, "reward": 0.3963378071784973, "reward_std": 0.05887097865343094, "rewards/gemini_judge_reward_func/mean": 0.1183035746216774, "rewards/gemini_judge_reward_func/std": 0.23666170239448547, "rewards/semantic_correctness_reward_func/mean": 0.42233186960220337, "rewards/semantic_correctness_reward_func/std": 0.19566462934017181, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 165.80804443359375, "completions/mean_terminated_length": 154.1583709716797, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.22773249114421065, "grad_norm": 0.020004237070679665, "kl": 0.015186309814453125, "learning_rate": 3.0066758668832752e-06, "loss": -0.023, "num_tokens": 237851751.0, "reward": 0.4449497163295746, "reward_std": 0.060879725962877274, "rewards/gemini_judge_reward_func/mean": 0.1037946417927742, "rewards/gemini_judge_reward_func/std": 0.22978128492832184, "rewards/semantic_correctness_reward_func/mean": 0.39502519369125366, "rewards/semantic_correctness_reward_func/std": 0.21134421229362488, "rewards/xmlcount_reward_func/mean": 0.8110670447349548, "rewards/xmlcount_reward_func/std": 0.3861640691757202, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 132.1428680419922, "completions/mean_terminated_length": 128.1435089111328, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.22807391916691563, "grad_norm": 0.02160588651895523, "kl": 0.018302440643310547, "learning_rate": 2.991522876735154e-06, "loss": 0.0054, "num_tokens": 238195039.0, "reward": 0.4097975790500641, "reward_std": 0.07103915512561798, "rewards/gemini_judge_reward_func/mean": 0.1517857164144516, "rewards/gemini_judge_reward_func/std": 0.24227328598499298, "rewards/semantic_correctness_reward_func/mean": 0.4584164023399353, "rewards/semantic_correctness_reward_func/std": 0.2187541127204895, "rewards/xmlcount_reward_func/mean": 0.643500030040741, "rewards/xmlcount_reward_func/std": 0.48071083426475525, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 168.3616180419922, "completions/mean_terminated_length": 156.74661254882812, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2284153471896206, "grad_norm": 0.021200206130743027, "kl": 0.015937328338623047, "learning_rate": 2.9763918509710647e-06, "loss": -0.0123, "num_tokens": 238590452.0, "reward": 0.429336816072464, "reward_std": 0.07005873322486877, "rewards/gemini_judge_reward_func/mean": 0.1450892835855484, "rewards/gemini_judge_reward_func/std": 0.2733252942562103, "rewards/semantic_correctness_reward_func/mean": 0.4533087909221649, "rewards/semantic_correctness_reward_func/std": 0.230568528175354, "rewards/xmlcount_reward_func/mean": 0.7015982270240784, "rewards/xmlcount_reward_func/std": 0.4568972587585449, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 165.20982360839844, "completions/mean_terminated_length": 141.57339477539062, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.22875677521232554, "grad_norm": 0.020127739757299423, "kl": 0.014457941055297852, "learning_rate": 2.9612829550614836e-06, "loss": -0.0052, "num_tokens": 238950803.0, "reward": 0.4565327763557434, "reward_std": 0.07144228368997574, "rewards/gemini_judge_reward_func/mean": 0.1897321492433548, "rewards/gemini_judge_reward_func/std": 0.30179867148399353, "rewards/semantic_correctness_reward_func/mean": 0.4736635982990265, "rewards/semantic_correctness_reward_func/std": 0.24912160634994507, "rewards/xmlcount_reward_func/mean": 0.7147678732872009, "rewards/xmlcount_reward_func/std": 0.45308414101600647, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 164.5, "completions/mean_terminated_length": 156.7567596435547, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.22909820323503052, "grad_norm": 0.020575378090143204, "kl": 0.015639543533325195, "learning_rate": 2.9461963542348737e-06, "loss": -0.0094, "num_tokens": 239289959.0, "reward": 0.4518532156944275, "reward_std": 0.07574018836021423, "rewards/gemini_judge_reward_func/mean": 0.1417410671710968, "rewards/gemini_judge_reward_func/std": 0.24999749660491943, "rewards/semantic_correctness_reward_func/mean": 0.4519446790218353, "rewards/semantic_correctness_reward_func/std": 0.20432503521442413, "rewards/xmlcount_reward_func/mean": 0.7619196772575378, "rewards/xmlcount_reward_func/std": 0.4264376759529114, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 157.3794708251953, "completions/mean_terminated_length": 149.57208251953125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.22943963125773548, "grad_norm": 0.024331238120794296, "kl": 0.014272212982177734, "learning_rate": 2.931132213475884e-06, "loss": -0.015, "num_tokens": 239647616.0, "reward": 0.45021215081214905, "reward_std": 0.0597931370139122, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.225807324051857, "rewards/semantic_correctness_reward_func/mean": 0.43698903918266296, "rewards/semantic_correctness_reward_func/std": 0.15982688963413239, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 161.4241180419922, "completions/mean_terminated_length": 145.74090576171875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.22978105928044043, "grad_norm": 0.020820723846554756, "kl": 0.016948699951171875, "learning_rate": 2.9160906975235493e-06, "loss": -0.0398, "num_tokens": 240000263.0, "reward": 0.40477100014686584, "reward_std": 0.06616359949111938, "rewards/gemini_judge_reward_func/mean": 0.1428571492433548, "rewards/gemini_judge_reward_func/std": 0.25873589515686035, "rewards/semantic_correctness_reward_func/mean": 0.4332745373249054, "rewards/semantic_correctness_reward_func/std": 0.19778446853160858, "rewards/xmlcount_reward_func/mean": 0.6524330973625183, "rewards/xmlcount_reward_func/std": 0.4732263386249542, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 175.1294708251953, "completions/mean_terminated_length": 147.7465362548828, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2301224873031454, "grad_norm": 0.02019997127354145, "kl": 0.014962196350097656, "learning_rate": 2.9010719708694724e-06, "loss": -0.0229, "num_tokens": 240372648.0, "reward": 0.4191317856311798, "reward_std": 0.0710809975862503, "rewards/gemini_judge_reward_func/mean": 0.1361607164144516, "rewards/gemini_judge_reward_func/std": 0.26019221544265747, "rewards/semantic_correctness_reward_func/mean": 0.42908725142478943, "rewards/semantic_correctness_reward_func/std": 0.22505711019039154, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 153.46875, "completions/mean_terminated_length": 149.56503295898438, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.23046391532585037, "grad_norm": 0.019473010674118996, "kl": 0.01583242416381836, "learning_rate": 2.8860761977560435e-06, "loss": -0.0266, "num_tokens": 240725581.0, "reward": 0.4498962461948395, "reward_std": 0.07846318185329437, "rewards/gemini_judge_reward_func/mean": 0.1450892835855484, "rewards/gemini_judge_reward_func/std": 0.24749507009983063, "rewards/semantic_correctness_reward_func/mean": 0.4278828799724579, "rewards/semantic_correctness_reward_func/std": 0.20671235024929047, "rewards/xmlcount_reward_func/mean": 0.7657098770141602, "rewards/xmlcount_reward_func/std": 0.42325305938720703, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 168.2544708251953, "completions/mean_terminated_length": 152.69544982910156, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.23080534334855532, "grad_norm": 0.019685067236423492, "kl": 0.012958049774169922, "learning_rate": 2.871103542174637e-06, "loss": -0.0118, "num_tokens": 241073886.0, "reward": 0.45788535475730896, "reward_std": 0.06220375373959541, "rewards/gemini_judge_reward_func/mean": 0.1428571492433548, "rewards/gemini_judge_reward_func/std": 0.255465567111969, "rewards/semantic_correctness_reward_func/mean": 0.43071243166923523, "rewards/semantic_correctness_reward_func/std": 0.18672212958335876, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 158.4419708251953, "completions/mean_terminated_length": 146.6923065185547, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.2311467713712603, "grad_norm": 0.019380411133170128, "kl": 0.015169143676757812, "learning_rate": 2.8561541678638145e-06, "loss": -0.0014, "num_tokens": 241437493.0, "reward": 0.4243607223033905, "reward_std": 0.06035372614860535, "rewards/gemini_judge_reward_func/mean": 0.1383928507566452, "rewards/gemini_judge_reward_func/std": 0.23158472776412964, "rewards/semantic_correctness_reward_func/mean": 0.4288035035133362, "rewards/semantic_correctness_reward_func/std": 0.19333001971244812, "rewards/xmlcount_reward_func/mean": 0.7081071734428406, "rewards/xmlcount_reward_func/std": 0.45519739389419556, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 160.52232360839844, "completions/mean_terminated_length": 144.82272338867188, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.23148819939396525, "grad_norm": 0.02005195803940296, "kl": 0.015508174896240234, "learning_rate": 2.8412282383075362e-06, "loss": 0.0163, "num_tokens": 241799122.0, "reward": 0.4341495931148529, "reward_std": 0.06551965326070786, "rewards/gemini_judge_reward_func/mean": 0.1417410671710968, "rewards/gemini_judge_reward_func/std": 0.24317796528339386, "rewards/semantic_correctness_reward_func/mean": 0.4572656452655792, "rewards/semantic_correctness_reward_func/std": 0.20034907758235931, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 164.40625, "completions/mean_terminated_length": 148.7772674560547, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.23182962741667024, "grad_norm": 0.02083410508930683, "kl": 0.01825714111328125, "learning_rate": 2.826325916733378e-06, "loss": -0.0167, "num_tokens": 242142693.0, "reward": 0.4272557497024536, "reward_std": 0.06736288964748383, "rewards/gemini_judge_reward_func/mean": 0.1082589253783226, "rewards/gemini_judge_reward_func/std": 0.21504218876361847, "rewards/semantic_correctness_reward_func/mean": 0.42719826102256775, "rewards/semantic_correctness_reward_func/std": 0.22015966475009918, "rewards/xmlcount_reward_func/mean": 0.7462812662124634, "rewards/xmlcount_reward_func/std": 0.4369716942310333, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 159.64732360839844, "completions/mean_terminated_length": 147.91403198242188, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2321710554393752, "grad_norm": 0.019481897354125977, "kl": 0.013461828231811523, "learning_rate": 2.811447366110741e-06, "loss": -0.0329, "num_tokens": 242488046.0, "reward": 0.44512689113616943, "reward_std": 0.06134679913520813, "rewards/gemini_judge_reward_func/mean": 0.1171875, "rewards/gemini_judge_reward_func/std": 0.22695417702198029, "rewards/semantic_correctness_reward_func/mean": 0.4540092945098877, "rewards/semantic_correctness_reward_func/std": 0.19972579181194305, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 163.79464721679688, "completions/mean_terminated_length": 144.15524291992188, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.23251248346208014, "grad_norm": 0.020205752924084663, "kl": 0.016667842864990234, "learning_rate": 2.796592749149071e-06, "loss": -0.0165, "num_tokens": 242860544.0, "reward": 0.4280025064945221, "reward_std": 0.07606809586286545, "rewards/gemini_judge_reward_func/mean": 0.1316964328289032, "rewards/gemini_judge_reward_func/std": 0.2656741440296173, "rewards/semantic_correctness_reward_func/mean": 0.4257535934448242, "rewards/semantic_correctness_reward_func/std": 0.19858862459659576, "rewards/xmlcount_reward_func/mean": 0.7254330515861511, "rewards/xmlcount_reward_func/std": 0.446159690618515, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 163.28125, "completions/mean_terminated_length": 143.630126953125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.23285391148478513, "grad_norm": 0.021477343514561653, "kl": 0.024187564849853516, "learning_rate": 2.7817622282960816e-06, "loss": -0.0087, "num_tokens": 243204363.0, "reward": 0.4583207070827484, "reward_std": 0.07647057622671127, "rewards/gemini_judge_reward_func/mean": 0.1662946492433548, "rewards/gemini_judge_reward_func/std": 0.2740088105201721, "rewards/semantic_correctness_reward_func/mean": 0.44556760787963867, "rewards/semantic_correctness_reward_func/std": 0.21110759675502777, "rewards/xmlcount_reward_func/mean": 0.7567232847213745, "rewards/xmlcount_reward_func/std": 0.4276689887046814, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 167.50894165039062, "completions/mean_terminated_length": 143.93577575683594, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.23319533950749008, "grad_norm": 0.02061975747346878, "kl": 0.019166946411132812, "learning_rate": 2.766955965735968e-06, "loss": -0.047, "num_tokens": 243585773.0, "reward": 0.4028172791004181, "reward_std": 0.06361385434865952, "rewards/gemini_judge_reward_func/mean": 0.1450892835855484, "rewards/gemini_judge_reward_func/std": 0.27023160457611084, "rewards/semantic_correctness_reward_func/mean": 0.4413006007671356, "rewards/semantic_correctness_reward_func/std": 0.1913142055273056, "rewards/xmlcount_reward_func/mean": 0.6413035988807678, "rewards/xmlcount_reward_func/std": 0.48018917441368103, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 153.92857360839844, "completions/mean_terminated_length": 150.02691650390625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.23353676753019503, "grad_norm": 0.020191872492432594, "kl": 0.014515876770019531, "learning_rate": 2.7521741233876496e-06, "loss": 0.0003, "num_tokens": 243900057.0, "reward": 0.46831250190734863, "reward_std": 0.0564018189907074, "rewards/gemini_judge_reward_func/mean": 0.1796875, "rewards/gemini_judge_reward_func/std": 0.3019375205039978, "rewards/semantic_correctness_reward_func/mean": 0.4449373781681061, "rewards/semantic_correctness_reward_func/std": 0.2366136610507965, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 148.47769165039062, "completions/mean_terminated_length": 144.55157470703125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.23387819555290001, "grad_norm": 0.02208324708044529, "kl": 0.01886892318725586, "learning_rate": 2.7374168629029814e-06, "loss": -0.0058, "num_tokens": 244234772.0, "reward": 0.4652222692966461, "reward_std": 0.06461701542139053, "rewards/gemini_judge_reward_func/mean": 0.1417410671710968, "rewards/gemini_judge_reward_func/std": 0.2745744585990906, "rewards/semantic_correctness_reward_func/mean": 0.4338791072368622, "rewards/semantic_correctness_reward_func/std": 0.23112650215625763, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 177.38394165039062, "completions/mean_terminated_length": 154.0825653076172, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.23421962357560497, "grad_norm": 0.020363980904221535, "kl": 0.015145301818847656, "learning_rate": 2.722684345665004e-06, "loss": -0.0154, "num_tokens": 244596058.0, "reward": 0.44785916805267334, "reward_std": 0.0589757114648819, "rewards/gemini_judge_reward_func/mean": 0.1383928507566452, "rewards/gemini_judge_reward_func/std": 0.24794963002204895, "rewards/semantic_correctness_reward_func/mean": 0.42526012659072876, "rewards/semantic_correctness_reward_func/std": 0.21264635026454926, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 160.88394165039062, "completions/mean_terminated_length": 145.19090270996094, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.23456105159830992, "grad_norm": 0.02010117843747139, "kl": 0.013428688049316406, "learning_rate": 2.707976732786166e-06, "loss": -0.006, "num_tokens": 244951076.0, "reward": 0.46008288860321045, "reward_std": 0.06361074000597, "rewards/gemini_judge_reward_func/mean": 0.109375, "rewards/gemini_judge_reward_func/std": 0.21381975710391998, "rewards/semantic_correctness_reward_func/mean": 0.43716415762901306, "rewards/semantic_correctness_reward_func/std": 0.2125110626220703, "rewards/xmlcount_reward_func/mean": 0.8222500681877136, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 169.5044708251953, "completions/mean_terminated_length": 153.96817016601562, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.2349024796210149, "grad_norm": 0.01886354200541973, "kl": 0.015913724899291992, "learning_rate": 2.693294185106562e-06, "loss": 0.0221, "num_tokens": 245285709.0, "reward": 0.45489609241485596, "reward_std": 0.06527674198150635, "rewards/gemini_judge_reward_func/mean": 0.1127232164144516, "rewards/gemini_judge_reward_func/std": 0.22675560414791107, "rewards/semantic_correctness_reward_func/mean": 0.4760338366031647, "rewards/semantic_correctness_reward_func/std": 0.19940443336963654, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 162.21875, "completions/mean_terminated_length": 138.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.23524390764371986, "grad_norm": 0.02161382883787155, "kl": 0.01592111587524414, "learning_rate": 2.678636863192184e-06, "loss": 0.0057, "num_tokens": 245645274.0, "reward": 0.44552478194236755, "reward_std": 0.06137440726161003, "rewards/gemini_judge_reward_func/mean": 0.0959821417927742, "rewards/gemini_judge_reward_func/std": 0.19305641949176788, "rewards/semantic_correctness_reward_func/mean": 0.4626862704753876, "rewards/semantic_correctness_reward_func/std": 0.20750680565834045, "rewards/xmlcount_reward_func/mean": 0.7864866256713867, "rewards/xmlcount_reward_func/std": 0.4116491377353668, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 149.97769165039062, "completions/mean_terminated_length": 138.11312866210938, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.2355853356664248, "grad_norm": 0.022082503885030746, "kl": 0.016037464141845703, "learning_rate": 2.6640049273331516e-06, "loss": 0.024, "num_tokens": 246015097.0, "reward": 0.42489418387413025, "reward_std": 0.07650549709796906, "rewards/gemini_judge_reward_func/mean": 0.1741071492433548, "rewards/gemini_judge_reward_func/std": 0.2944541275501251, "rewards/semantic_correctness_reward_func/mean": 0.4535064101219177, "rewards/semantic_correctness_reward_func/std": 0.22471977770328522, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 152.14732360839844, "completions/mean_terminated_length": 148.2376708984375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2359267636891298, "grad_norm": 0.020293327048420906, "kl": 0.014290809631347656, "learning_rate": 2.649398537541978e-06, "loss": -0.0307, "num_tokens": 246355270.0, "reward": 0.44305023550987244, "reward_std": 0.06119895726442337, "rewards/gemini_judge_reward_func/mean": 0.1607142835855484, "rewards/gemini_judge_reward_func/std": 0.2944881021976471, "rewards/semantic_correctness_reward_func/mean": 0.48290279507637024, "rewards/semantic_correctness_reward_func/std": 0.22012221813201904, "rewards/xmlcount_reward_func/mean": 0.7054598927497864, "rewards/xmlcount_reward_func/std": 0.455239862203598, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 168.16519165039062, "completions/mean_terminated_length": 148.6255645751953, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.23626819171183475, "grad_norm": 0.021590879186987877, "kl": 0.015369415283203125, "learning_rate": 2.6348178535517967e-06, "loss": 0.0018, "num_tokens": 246698131.0, "reward": 0.4446691870689392, "reward_std": 0.07133954018354416, "rewards/gemini_judge_reward_func/mean": 0.1339285671710968, "rewards/gemini_judge_reward_func/std": 0.249278262257576, "rewards/semantic_correctness_reward_func/mean": 0.41823869943618774, "rewards/semantic_correctness_reward_func/std": 0.191751629114151, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 169.8482208251953, "completions/mean_terminated_length": 154.31817626953125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2366096197345397, "grad_norm": 0.020112143829464912, "kl": 0.013370513916015625, "learning_rate": 2.6202630348146323e-06, "loss": 0.0207, "num_tokens": 247018465.0, "reward": 0.4661848545074463, "reward_std": 0.06909771263599396, "rewards/gemini_judge_reward_func/mean": 0.1316964328289032, "rewards/gemini_judge_reward_func/std": 0.21297545731067657, "rewards/semantic_correctness_reward_func/mean": 0.45878109335899353, "rewards/semantic_correctness_reward_func/std": 0.19541005790233612, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 169.07144165039062, "completions/mean_terminated_length": 141.4930877685547, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.23695104775724468, "grad_norm": 0.01998029090464115, "kl": 0.016425132751464844, "learning_rate": 2.605734240499652e-06, "loss": -0.0034, "num_tokens": 247406061.0, "reward": 0.38997000455856323, "reward_std": 0.06167588382959366, "rewards/gemini_judge_reward_func/mean": 0.1104910746216774, "rewards/gemini_judge_reward_func/std": 0.22035281360149384, "rewards/semantic_correctness_reward_func/mean": 0.4418678879737854, "rewards/semantic_correctness_reward_func/std": 0.2028319090604782, "rewards/xmlcount_reward_func/mean": 0.643500030040741, "rewards/xmlcount_reward_func/std": 0.48071083426475525, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 157.0491180419922, "completions/mean_terminated_length": 141.28636169433594, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.23729247577994964, "grad_norm": 0.020776091143488884, "kl": 0.015616416931152344, "learning_rate": 2.5912316294914232e-06, "loss": -0.0041, "num_tokens": 247761004.0, "reward": 0.440807580947876, "reward_std": 0.06188865751028061, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.2582615911960602, "rewards/semantic_correctness_reward_func/mean": 0.41232332587242126, "rewards/semantic_correctness_reward_func/std": 0.23134461045265198, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 153.66964721679688, "completions/mean_terminated_length": 145.82882690429688, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.2376339038026546, "grad_norm": 0.021136008203029633, "kl": 0.01209259033203125, "learning_rate": 2.576755360388177e-06, "loss": -0.0243, "num_tokens": 248112446.0, "reward": 0.4230984151363373, "reward_std": 0.06140593811869621, "rewards/gemini_judge_reward_func/mean": 0.1004464253783226, "rewards/gemini_judge_reward_func/std": 0.1936776041984558, "rewards/semantic_correctness_reward_func/mean": 0.4130990207195282, "rewards/semantic_correctness_reward_func/std": 0.17426711320877075, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 159.65179443359375, "completions/mean_terminated_length": 143.9363555908203, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.23797533182535957, "grad_norm": 0.020741021260619164, "kl": 0.01749563217163086, "learning_rate": 2.562305591500069e-06, "loss": -0.0072, "num_tokens": 248460224.0, "reward": 0.43691644072532654, "reward_std": 0.06515128910541534, "rewards/gemini_judge_reward_func/mean": 0.140625, "rewards/gemini_judge_reward_func/std": 0.27565911412239075, "rewards/semantic_correctness_reward_func/mean": 0.43758201599121094, "rewards/semantic_correctness_reward_func/std": 0.23180371522903442, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 162.0669708251953, "completions/mean_terminated_length": 154.3018035888672, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.23831675984806452, "grad_norm": 0.021172866225242615, "kl": 0.01595783233642578, "learning_rate": 2.5478824808474613e-06, "loss": -0.0077, "num_tokens": 248800735.0, "reward": 0.48907050490379333, "reward_std": 0.07695534080266953, "rewards/gemini_judge_reward_func/mean": 0.1662946492433548, "rewards/gemini_judge_reward_func/std": 0.2830647826194763, "rewards/semantic_correctness_reward_func/mean": 0.5040131211280823, "rewards/semantic_correctness_reward_func/std": 0.20649151504039764, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 163.19644165039062, "completions/mean_terminated_length": 135.42857360839844, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.2386581878707695, "grad_norm": 0.020416466519236565, "kl": 0.016710758209228516, "learning_rate": 2.5334861861591753e-06, "loss": -0.0458, "num_tokens": 249173807.0, "reward": 0.38040465116500854, "reward_std": 0.0654895082116127, "rewards/gemini_judge_reward_func/mean": 0.1395089328289032, "rewards/gemini_judge_reward_func/std": 0.27367985248565674, "rewards/semantic_correctness_reward_func/mean": 0.4075053632259369, "rewards/semantic_correctness_reward_func/std": 0.22781768441200256, "rewards/xmlcount_reward_func/mean": 0.6077500581741333, "rewards/xmlcount_reward_func/std": 0.48996880650520325, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 157.02679443359375, "completions/mean_terminated_length": 153.13902282714844, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.23899961589347446, "grad_norm": 0.020300550386309624, "kl": 0.01459813117980957, "learning_rate": 2.5191168648707888e-06, "loss": -0.0093, "num_tokens": 249524129.0, "reward": 0.4497314989566803, "reward_std": 0.06387756019830704, "rewards/gemini_judge_reward_func/mean": 0.1261160671710968, "rewards/gemini_judge_reward_func/std": 0.22584888339042664, "rewards/semantic_correctness_reward_func/mean": 0.45917510986328125, "rewards/semantic_correctness_reward_func/std": 0.22504274547100067, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 144.76339721679688, "completions/mean_terminated_length": 144.76339721679688, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.2393410439161794, "grad_norm": 0.020862894132733345, "kl": 0.01591944694519043, "learning_rate": 2.5047746741228977e-06, "loss": -0.0147, "num_tokens": 249899300.0, "reward": 0.4328038990497589, "reward_std": 0.0568770207464695, "rewards/gemini_judge_reward_func/mean": 0.1116071417927742, "rewards/gemini_judge_reward_func/std": 0.21397185325622559, "rewards/semantic_correctness_reward_func/mean": 0.43934059143066406, "rewards/semantic_correctness_reward_func/std": 0.2161037027835846, "rewards/xmlcount_reward_func/mean": 0.7507321238517761, "rewards/xmlcount_reward_func/std": 0.43440625071525574, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 156.40625, "completions/mean_terminated_length": 144.62896728515625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2396824719388844, "grad_norm": 0.020461153239011765, "kl": 0.020430326461791992, "learning_rate": 2.490459770759398e-06, "loss": -0.0092, "num_tokens": 250251623.0, "reward": 0.4267743229866028, "reward_std": 0.053675100207328796, "rewards/gemini_judge_reward_func/mean": 0.1506696492433548, "rewards/gemini_judge_reward_func/std": 0.2799535393714905, "rewards/semantic_correctness_reward_func/mean": 0.4382820129394531, "rewards/semantic_correctness_reward_func/std": 0.22525343298912048, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 155.1294708251953, "completions/mean_terminated_length": 143.33485412597656, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.24002389996158935, "grad_norm": 0.022830575704574585, "kl": 0.015569210052490234, "learning_rate": 2.476172311325783e-06, "loss": -0.0099, "num_tokens": 250598840.0, "reward": 0.46036437153816223, "reward_std": 0.0712478905916214, "rewards/gemini_judge_reward_func/mean": 0.1640625, "rewards/gemini_judge_reward_func/std": 0.28239211440086365, "rewards/semantic_correctness_reward_func/mean": 0.4364466667175293, "rewards/semantic_correctness_reward_func/std": 0.21228350698947906, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 157.1294708251953, "completions/mean_terminated_length": 149.31982421875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.2403653279842943, "grad_norm": 0.022787317633628845, "kl": 0.017244815826416016, "learning_rate": 2.461912452067415e-06, "loss": -0.0304, "num_tokens": 250945797.0, "reward": 0.4652026295661926, "reward_std": 0.06580142676830292, "rewards/gemini_judge_reward_func/mean": 0.1450892835855484, "rewards/gemini_judge_reward_func/std": 0.27536845207214355, "rewards/semantic_correctness_reward_func/mean": 0.4717719852924347, "rewards/semantic_correctness_reward_func/std": 0.23088571429252625, "rewards/xmlcount_reward_func/mean": 0.782031238079071, "rewards/xmlcount_reward_func/std": 0.41473886370658875, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 149.07144165039062, "completions/mean_terminated_length": 133.16363525390625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.24070675600699928, "grad_norm": 0.02126486226916313, "kl": 0.01680445671081543, "learning_rate": 2.447680348927837e-06, "loss": -0.0002, "num_tokens": 251297597.0, "reward": 0.3836461007595062, "reward_std": 0.047221578657627106, "rewards/gemini_judge_reward_func/mean": 0.1104910746216774, "rewards/gemini_judge_reward_func/std": 0.21389874815940857, "rewards/semantic_correctness_reward_func/mean": 0.374498188495636, "rewards/semantic_correctness_reward_func/std": 0.17782600224018097, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500184178352356, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 166.7678680419922, "completions/mean_terminated_length": 147.1963348388672, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.24104818402970424, "grad_norm": 0.019322630017995834, "kl": 0.015539407730102539, "learning_rate": 2.433476157547044e-06, "loss": -0.008, "num_tokens": 251664097.0, "reward": 0.42840591073036194, "reward_std": 0.07416719198226929, "rewards/gemini_judge_reward_func/mean": 0.1986607164144516, "rewards/gemini_judge_reward_func/std": 0.3125520348548889, "rewards/semantic_correctness_reward_func/mean": 0.4934581220149994, "rewards/semantic_correctness_reward_func/std": 0.23917879164218903, "rewards/xmlcount_reward_func/mean": 0.6256250739097595, "rewards/xmlcount_reward_func/std": 0.48569241166114807, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 154.90625, "completions/mean_terminated_length": 151.00897216796875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2413896120524092, "grad_norm": 0.0201317947357893, "kl": 0.01163625717163086, "learning_rate": 2.4193000332597984e-06, "loss": -0.0167, "num_tokens": 251992972.0, "reward": 0.4662090837955475, "reward_std": 0.06485062837600708, "rewards/gemini_judge_reward_func/mean": 0.1261160671710968, "rewards/gemini_judge_reward_func/std": 0.2437535524368286, "rewards/semantic_correctness_reward_func/mean": 0.434313029050827, "rewards/semantic_correctness_reward_func/std": 0.20111876726150513, "rewards/xmlcount_reward_func/mean": 0.8222500681877136, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 169.05357360839844, "completions/mean_terminated_length": 153.5090789794922, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.24173104007511417, "grad_norm": 0.0201317947357893, "kl": 0.015091657638549805, "learning_rate": 2.4193000332597984e-06, "loss": 0.0044, "num_tokens": 252365404.0, "reward": 0.4412159323692322, "reward_std": 0.06695268303155899, "rewards/gemini_judge_reward_func/mean": 0.1506696492433548, "rewards/gemini_judge_reward_func/std": 0.2718265652656555, "rewards/semantic_correctness_reward_func/mean": 0.47249937057495117, "rewards/semantic_correctness_reward_func/std": 0.22053788602352142, "rewards/xmlcount_reward_func/mean": 0.7161206007003784, "rewards/xmlcount_reward_func/std": 0.4517506957054138, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 147.90625, "completions/mean_terminated_length": 143.9775848388672, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.24207246809781913, "grad_norm": 0.019735103473067284, "kl": 0.017200469970703125, "learning_rate": 2.4051521310939258e-06, "loss": 0.0044, "num_tokens": 252725691.0, "reward": 0.4373900890350342, "reward_std": 0.07390647381544113, "rewards/gemini_judge_reward_func/mean": 0.1049107164144516, "rewards/gemini_judge_reward_func/std": 0.22370301187038422, "rewards/semantic_correctness_reward_func/mean": 0.404128760099411, "rewards/semantic_correctness_reward_func/std": 0.1993720382452011, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0401785714285714, "completions/max_length": 1024.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 180.4553680419922, "completions/mean_terminated_length": 145.14418029785156, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.24241389612052408, "grad_norm": 0.01987229473888874, "kl": 0.015630483627319336, "learning_rate": 2.391032605768613e-06, "loss": -0.0101, "num_tokens": 253089801.0, "reward": 0.4440220594406128, "reward_std": 0.07133130729198456, "rewards/gemini_judge_reward_func/mean": 0.1607142835855484, "rewards/gemini_judge_reward_func/std": 0.2778382897377014, "rewards/semantic_correctness_reward_func/mean": 0.47985124588012695, "rewards/semantic_correctness_reward_func/std": 0.21601252257823944, "rewards/xmlcount_reward_func/mean": 0.7094151377677917, "rewards/xmlcount_reward_func/std": 0.4536706507205963, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 169.70982360839844, "completions/mean_terminated_length": 142.15206909179688, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.24275532414322906, "grad_norm": 0.02286173216998577, "kl": 0.019670486450195312, "learning_rate": 2.3769416116927335e-06, "loss": -0.0615, "num_tokens": 253479884.0, "reward": 0.3929518163204193, "reward_std": 0.05669989064335823, "rewards/gemini_judge_reward_func/mean": 0.109375, "rewards/gemini_judge_reward_func/std": 0.24781839549541473, "rewards/semantic_correctness_reward_func/mean": 0.4232589900493622, "rewards/semantic_correctness_reward_func/std": 0.19399282336235046, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 159.66519165039062, "completions/mean_terminated_length": 147.93212890625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.24309675216593402, "grad_norm": 0.019998321309685707, "kl": 0.01382303237915039, "learning_rate": 2.3628793029631353e-06, "loss": -0.0188, "num_tokens": 253810629.0, "reward": 0.4336186945438385, "reward_std": 0.05938807874917984, "rewards/gemini_judge_reward_func/mean": 0.1082589253783226, "rewards/gemini_judge_reward_func/std": 0.20976413786411285, "rewards/semantic_correctness_reward_func/mean": 0.40090590715408325, "rewards/semantic_correctness_reward_func/std": 0.19256816804409027, "rewards/xmlcount_reward_func/mean": 0.7753348350524902, "rewards/xmlcount_reward_func/std": 0.4151875972747803, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 149.82589721679688, "completions/mean_terminated_length": 149.82589721679688, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.24343818018863897, "grad_norm": 0.02148658223450184, "kl": 0.014288663864135742, "learning_rate": 2.3488458333629777e-06, "loss": 0.013, "num_tokens": 254159642.0, "reward": 0.47455543279647827, "reward_std": 0.06833560764789581, "rewards/gemini_judge_reward_func/mean": 0.1171875, "rewards/gemini_judge_reward_func/std": 0.22695417702198029, "rewards/semantic_correctness_reward_func/mean": 0.42240217328071594, "rewards/semantic_correctness_reward_func/std": 0.2225671112537384, "rewards/xmlcount_reward_func/mean": 0.8580000996589661, "rewards/xmlcount_reward_func/std": 0.35106155276298523, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 148.87054443359375, "completions/mean_terminated_length": 140.9864959716797, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.24377960821134395, "grad_norm": 0.020947393029928207, "kl": 0.01793670654296875, "learning_rate": 2.3348413563600324e-06, "loss": 0.0081, "num_tokens": 254523265.0, "reward": 0.42428773641586304, "reward_std": 0.06304830312728882, "rewards/gemini_judge_reward_func/mean": 0.1473214328289032, "rewards/gemini_judge_reward_func/std": 0.2802416682243347, "rewards/semantic_correctness_reward_func/mean": 0.43254581093788147, "rewards/semantic_correctness_reward_func/std": 0.20892907679080963, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 169.25894165039062, "completions/mean_terminated_length": 141.68663024902344, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2441210362340489, "grad_norm": 0.019883327186107635, "kl": 0.014074325561523438, "learning_rate": 2.320866025105016e-06, "loss": -0.0148, "num_tokens": 254898227.0, "reward": 0.39803096652030945, "reward_std": 0.06506957113742828, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.21568416059017181, "rewards/semantic_correctness_reward_func/mean": 0.4475386440753937, "rewards/semantic_correctness_reward_func/std": 0.20398251712322235, "rewards/xmlcount_reward_func/mean": 0.6440759301185608, "rewards/xmlcount_reward_func/std": 0.4800132215023041, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 166.8794708251953, "completions/mean_terminated_length": 151.2954559326172, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.2444624642567539, "grad_norm": 0.019473331049084663, "kl": 0.014130592346191406, "learning_rate": 2.3069199924299175e-06, "loss": -0.0324, "num_tokens": 255253776.0, "reward": 0.45115411281585693, "reward_std": 0.07554265856742859, "rewards/gemini_judge_reward_func/mean": 0.1495535671710968, "rewards/gemini_judge_reward_func/std": 0.2656741738319397, "rewards/semantic_correctness_reward_func/mean": 0.4472883641719818, "rewards/semantic_correctness_reward_func/std": 0.21093684434890747, "rewards/xmlcount_reward_func/mean": 0.7546876072883606, "rewards/xmlcount_reward_func/std": 0.42363569140434265, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 160.46429443359375, "completions/mean_terminated_length": 148.74208068847656, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.24480389227945884, "grad_norm": 0.022255755960941315, "kl": 0.012172698974609375, "learning_rate": 2.29300341084631e-06, "loss": -0.0087, "num_tokens": 255613864.0, "reward": 0.48112553358078003, "reward_std": 0.06726916879415512, "rewards/gemini_judge_reward_func/mean": 0.1897321492433548, "rewards/gemini_judge_reward_func/std": 0.3276248574256897, "rewards/semantic_correctness_reward_func/mean": 0.48891332745552063, "rewards/semantic_correctness_reward_func/std": 0.24710066616535187, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 167.55804443359375, "completions/mean_terminated_length": 151.98635864257812, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.2451453203021638, "grad_norm": 0.01919080689549446, "kl": 0.016460418701171875, "learning_rate": 2.2791164325437047e-06, "loss": 0.014, "num_tokens": 255962745.0, "reward": 0.43085789680480957, "reward_std": 0.06295596063137054, "rewards/gemini_judge_reward_func/mean": 0.1651785671710968, "rewards/gemini_judge_reward_func/std": 0.2900702953338623, "rewards/semantic_correctness_reward_func/mean": 0.4296819865703583, "rewards/semantic_correctness_reward_func/std": 0.2027299851179123, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 161.13394165039062, "completions/mean_terminated_length": 145.44544982910156, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.24548674832486878, "grad_norm": 0.021129751577973366, "kl": 0.017368555068969727, "learning_rate": 2.265259209387867e-06, "loss": -0.0283, "num_tokens": 256321887.0, "reward": 0.4192156493663788, "reward_std": 0.07224642485380173, "rewards/gemini_judge_reward_func/mean": 0.1238839253783226, "rewards/gemini_judge_reward_func/std": 0.2117588222026825, "rewards/semantic_correctness_reward_func/mean": 0.4812476933002472, "rewards/semantic_correctness_reward_func/std": 0.20846113562583923, "rewards/xmlcount_reward_func/mean": 0.6835312843322754, "rewards/xmlcount_reward_func/std": 0.47700417041778564, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 163.7991180419922, "completions/mean_terminated_length": 159.94171142578125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.24582817634757373, "grad_norm": 0.020381838083267212, "kl": 0.013567447662353516, "learning_rate": 2.2514318929191707e-06, "loss": -0.0257, "num_tokens": 256673270.0, "reward": 0.4235455095767975, "reward_std": 0.07512028515338898, "rewards/gemini_judge_reward_func/mean": 0.1138392835855484, "rewards/gemini_judge_reward_func/std": 0.21146614849567413, "rewards/semantic_correctness_reward_func/mean": 0.42429882287979126, "rewards/semantic_correctness_reward_func/std": 0.1959509402513504, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 154.12054443359375, "completions/mean_terminated_length": 146.28378295898438, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.24616960437027868, "grad_norm": 0.021473728120326996, "kl": 0.017746925354003906, "learning_rate": 2.2376346343509343e-06, "loss": 0.0081, "num_tokens": 257008805.0, "reward": 0.45415744185447693, "reward_std": 0.0672735646367073, "rewards/gemini_judge_reward_func/mean": 0.1372767835855484, "rewards/gemini_judge_reward_func/std": 0.22551624476909637, "rewards/semantic_correctness_reward_func/mean": 0.4321710169315338, "rewards/semantic_correctness_reward_func/std": 0.21166691184043884, "rewards/xmlcount_reward_func/mean": 0.782031238079071, "rewards/xmlcount_reward_func/std": 0.41473886370658875, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 172.3794708251953, "completions/mean_terminated_length": 144.9078369140625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.24651103239298366, "grad_norm": 0.020387083292007446, "kl": 0.014590740203857422, "learning_rate": 2.2238675845677663e-06, "loss": -0.0239, "num_tokens": 257348598.0, "reward": 0.4348061680793762, "reward_std": 0.04860123619437218, "rewards/gemini_judge_reward_func/mean": 0.1328125, "rewards/gemini_judge_reward_func/std": 0.2424771934747696, "rewards/semantic_correctness_reward_func/mean": 0.44265562295913696, "rewards/semantic_correctness_reward_func/std": 0.21456462144851685, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 150.91964721679688, "completions/mean_terminated_length": 135.0454559326172, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.24685246041568862, "grad_norm": 0.024489011615514755, "kl": 0.017383098602294922, "learning_rate": 2.2101308941239204e-06, "loss": -0.0112, "num_tokens": 257698000.0, "reward": 0.4203735888004303, "reward_std": 0.07543018460273743, "rewards/gemini_judge_reward_func/mean": 0.1584821492433548, "rewards/gemini_judge_reward_func/std": 0.2928435206413269, "rewards/semantic_correctness_reward_func/mean": 0.4264035224914551, "rewards/semantic_correctness_reward_func/std": 0.22055880725383759, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 157.49554443359375, "completions/mean_terminated_length": 149.68919372558594, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.24719388843839357, "grad_norm": 0.022120485082268715, "kl": 0.013323783874511719, "learning_rate": 2.1964247132416373e-06, "loss": -0.0106, "num_tokens": 258037219.0, "reward": 0.4730894863605499, "reward_std": 0.060390159487724304, "rewards/gemini_judge_reward_func/mean": 0.1495535671710968, "rewards/gemini_judge_reward_func/std": 0.2635558545589447, "rewards/semantic_correctness_reward_func/mean": 0.4575989544391632, "rewards/semantic_correctness_reward_func/std": 0.22946356236934662, "rewards/xmlcount_reward_func/mean": 0.8043705821037292, "rewards/xmlcount_reward_func/std": 0.3985821604728699, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 158.94644165039062, "completions/mean_terminated_length": 143.21817016601562, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.24753531646109855, "grad_norm": 0.02180694416165352, "kl": 0.017840862274169922, "learning_rate": 2.182749191809518e-06, "loss": 0.0194, "num_tokens": 258411615.0, "reward": 0.43769633769989014, "reward_std": 0.07489373534917831, "rewards/gemini_judge_reward_func/mean": 0.1484375, "rewards/gemini_judge_reward_func/std": 0.28412362933158875, "rewards/semantic_correctness_reward_func/mean": 0.4616064429283142, "rewards/semantic_correctness_reward_func/std": 0.21261639893054962, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 161.68304443359375, "completions/mean_terminated_length": 141.99542236328125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2478767444838035, "grad_norm": 0.022032292559742928, "kl": 0.015535354614257812, "learning_rate": 2.1691044793808734e-06, "loss": -0.0172, "num_tokens": 258770296.0, "reward": 0.4378298223018646, "reward_std": 0.07478249073028564, "rewards/gemini_judge_reward_func/mean": 0.140625, "rewards/gemini_judge_reward_func/std": 0.26635146141052246, "rewards/semantic_correctness_reward_func/mean": 0.4556399881839752, "rewards/semantic_correctness_reward_func/std": 0.23665642738342285, "rewards/xmlcount_reward_func/mean": 0.7261295318603516, "rewards/xmlcount_reward_func/std": 0.44144105911254883, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 150.37054443359375, "completions/mean_terminated_length": 138.51132202148438, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.24821817250650846, "grad_norm": 0.022914161905646324, "kl": 0.018914222717285156, "learning_rate": 2.1554907251720947e-06, "loss": 0.0018, "num_tokens": 259124663.0, "reward": 0.4364605247974396, "reward_std": 0.06964661180973053, "rewards/gemini_judge_reward_func/mean": 0.1841517835855484, "rewards/gemini_judge_reward_func/std": 0.2906714081764221, "rewards/semantic_correctness_reward_func/mean": 0.45102569460868835, "rewards/semantic_correctness_reward_func/std": 0.2021547555923462, "rewards/xmlcount_reward_func/mean": 0.6814866662025452, "rewards/xmlcount_reward_func/std": 0.46647319197654724, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 177.90626525878906, "completions/mean_terminated_length": 150.61289978027344, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.24855960052921344, "grad_norm": 0.019710304215550423, "kl": 0.016555309295654297, "learning_rate": 2.1419080780610123e-06, "loss": -0.0271, "num_tokens": 259495574.0, "reward": 0.42523664236068726, "reward_std": 0.06002560257911682, "rewards/gemini_judge_reward_func/mean": 0.1138392835855484, "rewards/gemini_judge_reward_func/std": 0.23530438542366028, "rewards/semantic_correctness_reward_func/mean": 0.4327545166015625, "rewards/semantic_correctness_reward_func/std": 0.2105783075094223, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 168.3482208251953, "completions/mean_terminated_length": 148.81277465820312, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2489010285519184, "grad_norm": 0.020126277580857277, "kl": 0.015465736389160156, "learning_rate": 2.1283566865852824e-06, "loss": 0.0194, "num_tokens": 259854848.0, "reward": 0.4410649836063385, "reward_std": 0.08376999199390411, "rewards/gemini_judge_reward_func/mean": 0.1685267835855484, "rewards/gemini_judge_reward_func/std": 0.2991064190864563, "rewards/semantic_correctness_reward_func/mean": 0.4465034008026123, "rewards/semantic_correctness_reward_func/std": 0.20890836417675018, "rewards/xmlcount_reward_func/mean": 0.7108839750289917, "rewards/xmlcount_reward_func/std": 0.45479342341423035, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 154.7053680419922, "completions/mean_terminated_length": 146.87387084960938, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.24924245657462335, "grad_norm": 0.02328014187514782, "kl": 0.01580524444580078, "learning_rate": 2.11483669894075e-06, "loss": 0.0061, "num_tokens": 260216594.0, "reward": 0.42892396450042725, "reward_std": 0.06756206601858139, "rewards/gemini_judge_reward_func/mean": 0.1584821492433548, "rewards/gemini_judge_reward_func/std": 0.28801846504211426, "rewards/semantic_correctness_reward_func/mean": 0.4334053695201874, "rewards/semantic_correctness_reward_func/std": 0.23770850896835327, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 156.73214721679688, "completions/mean_terminated_length": 144.95928955078125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.24958388459732833, "grad_norm": 0.020609304308891296, "kl": 0.01523447036743164, "learning_rate": 2.1013482629798334e-06, "loss": -0.0081, "num_tokens": 260574246.0, "reward": 0.3963780105113983, "reward_std": 0.05536142736673355, "rewards/gemini_judge_reward_func/mean": 0.1060267835855484, "rewards/gemini_judge_reward_func/std": 0.23717662692070007, "rewards/semantic_correctness_reward_func/mean": 0.4024167060852051, "rewards/semantic_correctness_reward_func/std": 0.22327245771884918, "rewards/xmlcount_reward_func/mean": 0.6837098002433777, "rewards/xmlcount_reward_func/std": 0.46379369497299194, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 161.20982360839844, "completions/mean_terminated_length": 153.4369354248047, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.24992531262003329, "grad_norm": 0.01949228160083294, "kl": 0.014344215393066406, "learning_rate": 2.08789152620991e-06, "loss": -0.0066, "num_tokens": 260925937.0, "reward": 0.4333937168121338, "reward_std": 0.06455694884061813, "rewards/gemini_judge_reward_func/mean": 0.1138392835855484, "rewards/gemini_judge_reward_func/std": 0.22927159070968628, "rewards/semantic_correctness_reward_func/mean": 0.4036467969417572, "rewards/semantic_correctness_reward_func/std": 0.23052960634231567, "rewards/xmlcount_reward_func/mean": 0.7678214311599731, "rewards/xmlcount_reward_func/std": 0.4232962429523468, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 169.80804443359375, "completions/mean_terminated_length": 146.29815673828125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.25026674064273824, "grad_norm": 0.020751619711518288, "kl": 0.01793694496154785, "learning_rate": 2.0744666357916925e-06, "loss": -0.0265, "num_tokens": 261278230.0, "reward": 0.4312310814857483, "reward_std": 0.059306785464286804, "rewards/gemini_judge_reward_func/mean": 0.1082589253783226, "rewards/gemini_judge_reward_func/std": 0.20976413786411285, "rewards/semantic_correctness_reward_func/mean": 0.45834270119667053, "rewards/semantic_correctness_reward_func/std": 0.1977355182170868, "rewards/xmlcount_reward_func/mean": 0.7406473159790039, "rewards/xmlcount_reward_func/std": 0.4352959096431732, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 152.18304443359375, "completions/mean_terminated_length": 144.32882690429688, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.2506081686654432, "grad_norm": 0.020522359758615494, "kl": 0.018199682235717773, "learning_rate": 2.061073738537635e-06, "loss": -0.0267, "num_tokens": 261609671.0, "reward": 0.4286612272262573, "reward_std": 0.07006536424160004, "rewards/gemini_judge_reward_func/mean": 0.1138392835855484, "rewards/gemini_judge_reward_func/std": 0.238850936293602, "rewards/semantic_correctness_reward_func/mean": 0.4498775899410248, "rewards/semantic_correctness_reward_func/std": 0.1819140762090683, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 163.95982360839844, "completions/mean_terminated_length": 144.32418823242188, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.2509495966881482, "grad_norm": 0.021382635459303856, "kl": 0.01749277114868164, "learning_rate": 2.0477129809103147e-06, "loss": -0.0024, "num_tokens": 261982182.0, "reward": 0.45341190695762634, "reward_std": 0.07530924677848816, "rewards/gemini_judge_reward_func/mean": 0.15625, "rewards/gemini_judge_reward_func/std": 0.30342772603034973, "rewards/semantic_correctness_reward_func/mean": 0.4799344837665558, "rewards/semantic_correctness_reward_func/std": 0.231959268450737, "rewards/xmlcount_reward_func/mean": 0.7373126149177551, "rewards/xmlcount_reward_func/std": 0.4393693506717682, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 154.74107360839844, "completions/mean_terminated_length": 142.94117736816406, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.25129102471085313, "grad_norm": 0.0198200773447752, "kl": 0.012925148010253906, "learning_rate": 2.034384509020837e-06, "loss": -0.0073, "num_tokens": 262327636.0, "reward": 0.43286120891571045, "reward_std": 0.05032704025506973, "rewards/gemini_judge_reward_func/mean": 0.1049107164144516, "rewards/gemini_judge_reward_func/std": 0.19847624003887177, "rewards/semantic_correctness_reward_func/mean": 0.4172344207763672, "rewards/semantic_correctness_reward_func/std": 0.1993558406829834, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 156.1607208251953, "completions/mean_terminated_length": 144.38009643554688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2516324527335581, "grad_norm": 0.021754244342446327, "kl": 0.020738601684570312, "learning_rate": 2.021088468627237e-06, "loss": -0.0388, "num_tokens": 262679504.0, "reward": 0.4577825963497162, "reward_std": 0.04928554967045784, "rewards/gemini_judge_reward_func/mean": 0.0982142835855484, "rewards/gemini_judge_reward_func/std": 0.21270504593849182, "rewards/semantic_correctness_reward_func/mean": 0.41223424673080444, "rewards/semantic_correctness_reward_func/std": 0.1733391135931015, "rewards/xmlcount_reward_func/mean": 0.8401250839233398, "rewards/xmlcount_reward_func/std": 0.3684578835964203, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 153.9241180419922, "completions/mean_terminated_length": 138.1045379638672, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2519738807562631, "grad_norm": 0.020852848887443542, "kl": 0.01461172103881836, "learning_rate": 2.0078250051328783e-06, "loss": -0.02, "num_tokens": 263058583.0, "reward": 0.39713457226753235, "reward_std": 0.06134895235300064, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.26361992955207825, "rewards/semantic_correctness_reward_func/mean": 0.4419762194156647, "rewards/semantic_correctness_reward_func/std": 0.21511034667491913, "rewards/xmlcount_reward_func/mean": 0.643500030040741, "rewards/xmlcount_reward_func/std": 0.48071080446243286, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 155.4419708251953, "completions/mean_terminated_length": 135.6118621826172, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.252315308778968, "grad_norm": 0.02940811775624752, "kl": 0.023842573165893555, "learning_rate": 1.9945942635848745e-06, "loss": -0.041, "num_tokens": 263423142.0, "reward": 0.4132702052593231, "reward_std": 0.07020176947116852, "rewards/gemini_judge_reward_func/mean": 0.125, "rewards/gemini_judge_reward_func/std": 0.2494388222694397, "rewards/semantic_correctness_reward_func/mean": 0.45785093307495117, "rewards/semantic_correctness_reward_func/std": 0.20504869520664215, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 160.79019165039062, "completions/mean_terminated_length": 149.07240295410156, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.252656736801673, "grad_norm": 0.020677493885159492, "kl": 0.015691041946411133, "learning_rate": 1.981396388672496e-06, "loss": -0.009, "num_tokens": 263798431.0, "reward": 0.3558245003223419, "reward_std": 0.059479519724845886, "rewards/gemini_judge_reward_func/mean": 0.0982142835855484, "rewards/gemini_judge_reward_func/std": 0.1990930736064911, "rewards/semantic_correctness_reward_func/mean": 0.43595272302627563, "rewards/semantic_correctness_reward_func/std": 0.1965608447790146, "rewards/xmlcount_reward_func/mean": 0.5733705759048462, "rewards/xmlcount_reward_func/std": 0.49531227350234985, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 157.1741180419922, "completions/mean_terminated_length": 145.40724182128906, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.252998164824378, "grad_norm": 0.020069163292646408, "kl": 0.015597820281982422, "learning_rate": 1.9682315247255897e-06, "loss": 0.0015, "num_tokens": 264141958.0, "reward": 0.46845346689224243, "reward_std": 0.07669265568256378, "rewards/gemini_judge_reward_func/mean": 0.1428571492433548, "rewards/gemini_judge_reward_func/std": 0.23369301855564117, "rewards/semantic_correctness_reward_func/mean": 0.44457077980041504, "rewards/semantic_correctness_reward_func/std": 0.20611417293548584, "rewards/xmlcount_reward_func/mean": 0.8059911727905273, "rewards/xmlcount_reward_func/std": 0.3960340917110443, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 150.99107360839844, "completions/mean_terminated_length": 150.99107360839844, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2533395928470829, "grad_norm": 0.02022167295217514, "kl": 0.015549659729003906, "learning_rate": 1.9550998157129946e-06, "loss": -0.0146, "num_tokens": 264484752.0, "reward": 0.4745713174343109, "reward_std": 0.06252977252006531, "rewards/gemini_judge_reward_func/mean": 0.1473214328289032, "rewards/gemini_judge_reward_func/std": 0.2861793637275696, "rewards/semantic_correctness_reward_func/mean": 0.45428499579429626, "rewards/semantic_correctness_reward_func/std": 0.24253596365451813, "rewards/xmlcount_reward_func/mean": 0.8119643330574036, "rewards/xmlcount_reward_func/std": 0.38716888427734375, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 162.91964721679688, "completions/mean_terminated_length": 147.2636260986328, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2536810208697879, "grad_norm": 0.02243378572165966, "kl": 0.016266345977783203, "learning_rate": 1.9420014052409793e-06, "loss": -0.0132, "num_tokens": 264851810.0, "reward": 0.4235195517539978, "reward_std": 0.07452236860990524, "rewards/gemini_judge_reward_func/mean": 0.1395089328289032, "rewards/gemini_judge_reward_func/std": 0.2478865385055542, "rewards/semantic_correctness_reward_func/mean": 0.480079710483551, "rewards/semantic_correctness_reward_func/std": 0.20092709362506866, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 160.6875, "completions/mean_terminated_length": 152.909912109375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.25402244889249287, "grad_norm": 0.020303336903452873, "kl": 0.013786792755126953, "learning_rate": 1.928936436551661e-06, "loss": -0.0067, "num_tokens": 265200628.0, "reward": 0.46757015585899353, "reward_std": 0.05975889414548874, "rewards/gemini_judge_reward_func/mean": 0.1149553582072258, "rewards/gemini_judge_reward_func/std": 0.22932343184947968, "rewards/semantic_correctness_reward_func/mean": 0.42321667075157166, "rewards/semantic_correctness_reward_func/std": 0.20813030004501343, "rewards/xmlcount_reward_func/mean": 0.8423617482185364, "rewards/xmlcount_reward_func/std": 0.3648380935192108, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 161.35714721679688, "completions/mean_terminated_length": 141.66209411621094, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2543638769151978, "grad_norm": 0.02081400714814663, "kl": 0.016047000885009766, "learning_rate": 1.915905052521445e-06, "loss": -0.0215, "num_tokens": 265583028.0, "reward": 0.38103312253952026, "reward_std": 0.05963090807199478, "rewards/gemini_judge_reward_func/mean": 0.1183035746216774, "rewards/gemini_judge_reward_func/std": 0.2482219785451889, "rewards/semantic_correctness_reward_func/mean": 0.4097905158996582, "rewards/semantic_correctness_reward_func/std": 0.22186532616615295, "rewards/xmlcount_reward_func/mean": 0.6293839812278748, "rewards/xmlcount_reward_func/std": 0.4824991822242737, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 168.90179443359375, "completions/mean_terminated_length": 149.3789825439453, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2547053049379028, "grad_norm": 0.020080897957086563, "kl": 0.017374038696289062, "learning_rate": 1.9029073956594607e-06, "loss": -0.0236, "num_tokens": 265954738.0, "reward": 0.43171700835227966, "reward_std": 0.07286177575588226, "rewards/gemini_judge_reward_func/mean": 0.1517857164144516, "rewards/gemini_judge_reward_func/std": 0.29065632820129395, "rewards/semantic_correctness_reward_func/mean": 0.4607636034488678, "rewards/semantic_correctness_reward_func/std": 0.23796068131923676, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 152.83482360839844, "completions/mean_terminated_length": 148.92825317382812, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.25504673296060776, "grad_norm": 0.021075395867228508, "kl": 0.01665353775024414, "learning_rate": 1.8899436081059974e-06, "loss": 0.0303, "num_tokens": 266302517.0, "reward": 0.48689210414886475, "reward_std": 0.07886835187673569, "rewards/gemini_judge_reward_func/mean": 0.1964285671710968, "rewards/gemini_judge_reward_func/std": 0.3121755123138428, "rewards/semantic_correctness_reward_func/mean": 0.5043531060218811, "rewards/semantic_correctness_reward_func/std": 0.22672364115715027, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 158.3482208251953, "completions/mean_terminated_length": 134.52293395996094, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2553881609833127, "grad_norm": 0.023336829617619514, "kl": 0.02243947982788086, "learning_rate": 1.877013831630961e-06, "loss": -0.0408, "num_tokens": 266703011.0, "reward": 0.35559192299842834, "reward_std": 0.06644192337989807, "rewards/gemini_judge_reward_func/mean": 0.1261160671710968, "rewards/gemini_judge_reward_func/std": 0.2571813464164734, "rewards/semantic_correctness_reward_func/mean": 0.4174772799015045, "rewards/semantic_correctness_reward_func/std": 0.20215776562690735, "rewards/xmlcount_reward_func/mean": 0.5541250109672546, "rewards/xmlcount_reward_func/std": 0.4987334609031677, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 174.8928680419922, "completions/mean_terminated_length": 155.5068359375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.25572958900601767, "grad_norm": 0.01892857439815998, "kl": 0.014866113662719727, "learning_rate": 1.864118207632315e-06, "loss": -0.0094, "num_tokens": 267037287.0, "reward": 0.4351115822792053, "reward_std": 0.062350302934646606, "rewards/gemini_judge_reward_func/mean": 0.1082589253783226, "rewards/gemini_judge_reward_func/std": 0.19014061987400055, "rewards/semantic_correctness_reward_func/mean": 0.4575398862361908, "rewards/semantic_correctness_reward_func/std": 0.19340014457702637, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 154.46875, "completions/mean_terminated_length": 150.56951904296875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.25607101702872265, "grad_norm": 0.019694700837135315, "kl": 0.013186931610107422, "learning_rate": 1.851256877134538e-06, "loss": -0.0324, "num_tokens": 267383892.0, "reward": 0.4643140137195587, "reward_std": 0.06906662881374359, "rewards/gemini_judge_reward_func/mean": 0.1450892835855484, "rewards/gemini_judge_reward_func/std": 0.27434879541397095, "rewards/semantic_correctness_reward_func/mean": 0.4583911895751953, "rewards/semantic_correctness_reward_func/std": 0.20246011018753052, "rewards/xmlcount_reward_func/mean": 0.786500096321106, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0401785714285714, "completions/max_length": 1024.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 189.77679443359375, "completions/mean_terminated_length": 154.85581970214844, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.2564124450514276, "grad_norm": 0.018297072499990463, "kl": 0.013269424438476562, "learning_rate": 1.838429980787081e-06, "loss": -0.0136, "num_tokens": 267744046.0, "reward": 0.4284234344959259, "reward_std": 0.06550651043653488, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.23435330390930176, "rewards/semantic_correctness_reward_func/mean": 0.4554206430912018, "rewards/semantic_correctness_reward_func/std": 0.21077513694763184, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 171.38839721679688, "completions/mean_terminated_length": 147.92201232910156, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.25675387307413255, "grad_norm": 0.019334284588694572, "kl": 0.013591766357421875, "learning_rate": 1.825637658862824e-06, "loss": 0.0009, "num_tokens": 268074497.0, "reward": 0.42137858271598816, "reward_std": 0.05194444581866264, "rewards/gemini_judge_reward_func/mean": 0.0948660746216774, "rewards/gemini_judge_reward_func/std": 0.20001789927482605, "rewards/semantic_correctness_reward_func/mean": 0.41566064953804016, "rewards/semantic_correctness_reward_func/std": 0.2019502967596054, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 160.01339721679688, "completions/mean_terminated_length": 152.229736328125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.25709530109683754, "grad_norm": 0.02137918211519718, "kl": 0.021810531616210938, "learning_rate": 1.8128800512565514e-06, "loss": -0.0168, "num_tokens": 268442316.0, "reward": 0.4417394697666168, "reward_std": 0.061608508229255676, "rewards/gemini_judge_reward_func/mean": 0.1183035746216774, "rewards/gemini_judge_reward_func/std": 0.20899026095867157, "rewards/semantic_correctness_reward_func/mean": 0.43484005331993103, "rewards/semantic_correctness_reward_func/std": 0.19988852739334106, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 155.6875, "completions/mean_terminated_length": 151.79373168945312, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.25743672911954246, "grad_norm": 0.020462283864617348, "kl": 0.015665769577026367, "learning_rate": 1.8001572974834169e-06, "loss": 0.0127, "num_tokens": 268793298.0, "reward": 0.4836767017841339, "reward_std": 0.06072762608528137, "rewards/gemini_judge_reward_func/mean": 0.1752232164144516, "rewards/gemini_judge_reward_func/std": 0.27353349328041077, "rewards/semantic_correctness_reward_func/mean": 0.4589102268218994, "rewards/semantic_correctness_reward_func/std": 0.1992826759815216, "rewards/xmlcount_reward_func/mean": 0.8045134544372559, "rewards/xmlcount_reward_func/std": 0.395001083612442, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 173.93304443359375, "completions/mean_terminated_length": 146.5115203857422, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.25777815714224744, "grad_norm": 0.020110012963414192, "kl": 0.014056921005249023, "learning_rate": 1.7874695366774191e-06, "loss": 0.0103, "num_tokens": 269164239.0, "reward": 0.4110858738422394, "reward_std": 0.0682491660118103, "rewards/gemini_judge_reward_func/mean": 0.1372767835855484, "rewards/gemini_judge_reward_func/std": 0.27684351801872253, "rewards/semantic_correctness_reward_func/mean": 0.44917917251586914, "rewards/semantic_correctness_reward_func/std": 0.22507894039154053, "rewards/xmlcount_reward_func/mean": 0.6658482551574707, "rewards/xmlcount_reward_func/std": 0.4710778594017029, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 155.49554443359375, "completions/mean_terminated_length": 151.6009063720703, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.2581195851649524, "grad_norm": 0.01925048790872097, "kl": 0.014804601669311523, "learning_rate": 1.774816907589873e-06, "loss": -0.0002, "num_tokens": 269511374.0, "reward": 0.4355492889881134, "reward_std": 0.061426255851984024, "rewards/gemini_judge_reward_func/mean": 0.1305803507566452, "rewards/gemini_judge_reward_func/std": 0.249376118183136, "rewards/semantic_correctness_reward_func/mean": 0.4709513783454895, "rewards/semantic_correctness_reward_func/std": 0.2258910834789276, "rewards/xmlcount_reward_func/mean": 0.7228170037269592, "rewards/xmlcount_reward_func/std": 0.4484759569168091, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 162.03125, "completions/mean_terminated_length": 154.26576232910156, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.25846101318765735, "grad_norm": 0.020978957414627075, "kl": 0.014691352844238281, "learning_rate": 1.7621995485879062e-06, "loss": -0.0172, "num_tokens": 269879709.0, "reward": 0.417973130941391, "reward_std": 0.06662485003471375, "rewards/gemini_judge_reward_func/mean": 0.1104910746216774, "rewards/gemini_judge_reward_func/std": 0.2327246069908142, "rewards/semantic_correctness_reward_func/mean": 0.4434188902378082, "rewards/semantic_correctness_reward_func/std": 0.18930912017822266, "rewards/xmlcount_reward_func/mean": 0.712732195854187, "rewards/xmlcount_reward_func/std": 0.4530121684074402, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 157.69644165039062, "completions/mean_terminated_length": 149.8918914794922, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.25880244121036233, "grad_norm": 0.020836396142840385, "kl": 0.012416601181030273, "learning_rate": 1.749617597652934e-06, "loss": 0.0155, "num_tokens": 270216073.0, "reward": 0.45066124200820923, "reward_std": 0.06927520036697388, "rewards/gemini_judge_reward_func/mean": 0.109375, "rewards/gemini_judge_reward_func/std": 0.22280631959438324, "rewards/semantic_correctness_reward_func/mean": 0.4258059859275818, "rewards/semantic_correctness_reward_func/std": 0.1996365785598755, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 159.52679443359375, "completions/mean_terminated_length": 147.79185485839844, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.2591438692330673, "grad_norm": 0.02067898027598858, "kl": 0.013218402862548828, "learning_rate": 1.7370711923791567e-06, "loss": -0.0102, "num_tokens": 270580307.0, "reward": 0.4589332044124603, "reward_std": 0.06333411484956741, "rewards/gemini_judge_reward_func/mean": 0.1484375, "rewards/gemini_judge_reward_func/std": 0.2761194407939911, "rewards/semantic_correctness_reward_func/mean": 0.4605408012866974, "rewards/semantic_correctness_reward_func/std": 0.22162102162837982, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 180.91964721679688, "completions/mean_terminated_length": 153.72349548339844, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.25948529725577224, "grad_norm": 0.0202178992331028, "kl": 0.013530254364013672, "learning_rate": 1.7245604699720536e-06, "loss": -0.025, "num_tokens": 270939521.0, "reward": 0.42256343364715576, "reward_std": 0.06891320645809174, "rewards/gemini_judge_reward_func/mean": 0.1696428507566452, "rewards/gemini_judge_reward_func/std": 0.29326191544532776, "rewards/semantic_correctness_reward_func/mean": 0.438263475894928, "rewards/semantic_correctness_reward_func/std": 0.20034128427505493, "rewards/xmlcount_reward_func/mean": 0.6676340103149414, "rewards/xmlcount_reward_func/std": 0.4686497449874878, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 157.61607360839844, "completions/mean_terminated_length": 149.81082153320312, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.2598267252784772, "grad_norm": 0.019846944138407707, "kl": 0.01799154281616211, "learning_rate": 1.7120855672468779e-06, "loss": 0.0041, "num_tokens": 271284047.0, "reward": 0.4936152398586273, "reward_std": 0.06837694346904755, "rewards/gemini_judge_reward_func/mean": 0.1997767835855484, "rewards/gemini_judge_reward_func/std": 0.29091235995292664, "rewards/semantic_correctness_reward_func/mean": 0.4955224394798279, "rewards/semantic_correctness_reward_func/std": 0.2096569538116455, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 162.7544708251953, "completions/mean_terminated_length": 151.0633544921875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2601681533011822, "grad_norm": 0.020178409293293953, "kl": 0.01193547248840332, "learning_rate": 1.6996466206271679e-06, "loss": 0.0086, "num_tokens": 271631348.0, "reward": 0.47532185912132263, "reward_std": 0.07062742859125137, "rewards/gemini_judge_reward_func/mean": 0.1238839253783226, "rewards/gemini_judge_reward_func/std": 0.2460424154996872, "rewards/semantic_correctness_reward_func/mean": 0.47319841384887695, "rewards/semantic_correctness_reward_func/std": 0.21414029598236084, "rewards/xmlcount_reward_func/mean": 0.8278214335441589, "rewards/xmlcount_reward_func/std": 0.3753868639469147, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 170.85269165039062, "completions/mean_terminated_length": 147.37155151367188, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.2605095813238872, "grad_norm": 0.01988663524389267, "kl": 0.013807296752929688, "learning_rate": 1.6872437661432518e-06, "loss": -0.036, "num_tokens": 271988783.0, "reward": 0.43647101521492004, "reward_std": 0.059993524104356766, "rewards/gemini_judge_reward_func/mean": 0.1127232164144516, "rewards/gemini_judge_reward_func/std": 0.23524853587150574, "rewards/semantic_correctness_reward_func/mean": 0.46438151597976685, "rewards/semantic_correctness_reward_func/std": 0.2084624022245407, "rewards/xmlcount_reward_func/mean": 0.7462633848190308, "rewards/xmlcount_reward_func/std": 0.4369613230228424, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 157.19644165039062, "completions/mean_terminated_length": 157.19644165039062, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2608510093465921, "grad_norm": 0.01950724609196186, "kl": 0.012348175048828125, "learning_rate": 1.6748771394307584e-06, "loss": -0.0017, "num_tokens": 272317423.0, "reward": 0.43308770656585693, "reward_std": 0.04825283959507942, "rewards/gemini_judge_reward_func/mean": 0.0970982164144516, "rewards/gemini_judge_reward_func/std": 0.24214671552181244, "rewards/semantic_correctness_reward_func/mean": 0.3988582193851471, "rewards/semantic_correctness_reward_func/std": 0.19523859024047852, "rewards/xmlcount_reward_func/mean": 0.7861920595169067, "rewards/xmlcount_reward_func/std": 0.4122726619243622, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 164.9732208251953, "completions/mean_terminated_length": 149.3545379638672, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.2611924373692971, "grad_norm": 0.020313076674938202, "kl": 0.01384592056274414, "learning_rate": 1.6625468757291379e-06, "loss": -0.0154, "num_tokens": 272666877.0, "reward": 0.4314127266407013, "reward_std": 0.06205175817012787, "rewards/gemini_judge_reward_func/mean": 0.1015625, "rewards/gemini_judge_reward_func/std": 0.18942859768867493, "rewards/semantic_correctness_reward_func/mean": 0.41668835282325745, "rewards/semantic_correctness_reward_func/std": 0.1859598010778427, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 171.37054443359375, "completions/mean_terminated_length": 143.86636352539062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.2615338653920021, "grad_norm": 0.020971033722162247, "kl": 0.013844966888427734, "learning_rate": 1.6502531098801756e-06, "loss": -0.019, "num_tokens": 273015940.0, "reward": 0.42292889952659607, "reward_std": 0.0625869482755661, "rewards/gemini_judge_reward_func/mean": 0.1116071417927742, "rewards/gemini_judge_reward_func/std": 0.22295227646827698, "rewards/semantic_correctness_reward_func/mean": 0.4241444170475006, "rewards/semantic_correctness_reward_func/std": 0.1952347457408905, "rewards/xmlcount_reward_func/mean": 0.7336428761482239, "rewards/xmlcount_reward_func/std": 0.49561816453933716, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 163.9419708251953, "completions/mean_terminated_length": 152.2669677734375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.261875293414707, "grad_norm": 0.02085995301604271, "kl": 0.013644695281982422, "learning_rate": 1.6379959763265268e-06, "loss": -0.0253, "num_tokens": 273385231.0, "reward": 0.40597787499427795, "reward_std": 0.05839291214942932, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.24942879378795624, "rewards/semantic_correctness_reward_func/mean": 0.4526750147342682, "rewards/semantic_correctness_reward_func/std": 0.18940961360931396, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 164.46429443359375, "completions/mean_terminated_length": 152.79638671875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.262216721437412, "grad_norm": 0.019793834537267685, "kl": 0.012224435806274414, "learning_rate": 1.62577560911024e-06, "loss": -0.0217, "num_tokens": 273730931.0, "reward": 0.46663984656333923, "reward_std": 0.07231976836919785, "rewards/gemini_judge_reward_func/mean": 0.1383928507566452, "rewards/gemini_judge_reward_func/std": 0.23873566091060638, "rewards/semantic_correctness_reward_func/mean": 0.43424367904663086, "rewards/semantic_correctness_reward_func/std": 0.20122456550598145, "rewards/xmlcount_reward_func/mean": 0.8110848665237427, "rewards/xmlcount_reward_func/std": 0.3890477418899536, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 166.2053680419922, "completions/mean_terminated_length": 142.59632873535156, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.26255814946011696, "grad_norm": 0.019900793209671974, "kl": 0.012137651443481445, "learning_rate": 1.6135921418712959e-06, "loss": -0.0266, "num_tokens": 274109337.0, "reward": 0.4262751340866089, "reward_std": 0.06663599610328674, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.2549728751182556, "rewards/semantic_correctness_reward_func/mean": 0.4357326328754425, "rewards/semantic_correctness_reward_func/std": 0.221801295876503, "rewards/xmlcount_reward_func/mean": 0.7194732427597046, "rewards/xmlcount_reward_func/std": 0.44856733083724976, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 165.85269165039062, "completions/mean_terminated_length": 150.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.2628995774828219, "grad_norm": 0.020068790763616562, "kl": 0.011019706726074219, "learning_rate": 1.6014457078461354e-06, "loss": -0.004, "num_tokens": 274433136.0, "reward": 0.49018624424934387, "reward_std": 0.05191074684262276, "rewards/gemini_judge_reward_func/mean": 0.1395089328289032, "rewards/gemini_judge_reward_func/std": 0.25894618034362793, "rewards/semantic_correctness_reward_func/mean": 0.420163094997406, "rewards/semantic_correctness_reward_func/std": 0.21702919900417328, "rewards/xmlcount_reward_func/mean": 0.8758750557899475, "rewards/xmlcount_reward_func/std": 0.33179107308387756, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 151.52232360839844, "completions/mean_terminated_length": 139.67874145507812, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.26324100550552687, "grad_norm": 0.02097266912460327, "kl": 0.02287602424621582, "learning_rate": 1.5893364398662175e-06, "loss": -0.0015, "num_tokens": 274817569.0, "reward": 0.4012994170188904, "reward_std": 0.06106601655483246, "rewards/gemini_judge_reward_func/mean": 0.1160714253783226, "rewards/gemini_judge_reward_func/std": 0.25483787059783936, "rewards/semantic_correctness_reward_func/mean": 0.40730950236320496, "rewards/semantic_correctness_reward_func/std": 0.2029609978199005, "rewards/xmlcount_reward_func/mean": 0.6835223436355591, "rewards/xmlcount_reward_func/std": 0.4653799831867218, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 162.83929443359375, "completions/mean_terminated_length": 151.14932250976562, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.26358243352823185, "grad_norm": 0.02049219235777855, "kl": 0.015248298645019531, "learning_rate": 1.5772644703565564e-06, "loss": -0.0374, "num_tokens": 275173829.0, "reward": 0.45400094985961914, "reward_std": 0.07043319195508957, "rewards/gemini_judge_reward_func/mean": 0.1328125, "rewards/gemini_judge_reward_func/std": 0.23898446559906006, "rewards/semantic_correctness_reward_func/mean": 0.4313795268535614, "rewards/semantic_correctness_reward_func/std": 0.1887551099061966, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 173.20089721679688, "completions/mean_terminated_length": 157.7318115234375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.2639238615509368, "grad_norm": 0.020059634000062943, "kl": 0.013273954391479492, "learning_rate": 1.5652299313342772e-06, "loss": -0.0095, "num_tokens": 275523618.0, "reward": 0.4348951280117035, "reward_std": 0.06464679539203644, "rewards/gemini_judge_reward_func/mean": 0.1305803507566452, "rewards/gemini_judge_reward_func/std": 0.2571229338645935, "rewards/semantic_correctness_reward_func/mean": 0.4475648105144501, "rewards/semantic_correctness_reward_func/std": 0.2125941663980484, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 163.45089721679688, "completions/mean_terminated_length": 155.6981964111328, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.26426528957364176, "grad_norm": 0.020751817151904106, "kl": 0.01684427261352539, "learning_rate": 1.5532329544071712e-06, "loss": 0.0093, "num_tokens": 275892547.0, "reward": 0.41509416699409485, "reward_std": 0.05522005259990692, "rewards/gemini_judge_reward_func/mean": 0.109375, "rewards/gemini_judge_reward_func/std": 0.22406068444252014, "rewards/semantic_correctness_reward_func/mean": 0.4133010506629944, "rewards/semantic_correctness_reward_func/std": 0.20829689502716064, "rewards/xmlcount_reward_func/mean": 0.7217098474502563, "rewards/xmlcount_reward_func/std": 0.4462066888809204, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 147.7678680419922, "completions/mean_terminated_length": 143.8385772705078, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.26460671759634674, "grad_norm": 0.018952973186969757, "kl": 0.013130664825439453, "learning_rate": 1.5412736707722537e-06, "loss": -0.0097, "num_tokens": 276235711.0, "reward": 0.4852498769760132, "reward_std": 0.053085166960954666, "rewards/gemini_judge_reward_func/mean": 0.1919642835855484, "rewards/gemini_judge_reward_func/std": 0.2860393822193146, "rewards/semantic_correctness_reward_func/mean": 0.48837438225746155, "rewards/semantic_correctness_reward_func/std": 0.2230820506811142, "rewards/xmlcount_reward_func/mean": 0.7769732475280762, "rewards/xmlcount_reward_func/std": 0.4148280918598175, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 182.04019165039062, "completions/mean_terminated_length": 158.86695861816406, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.26494814561905167, "grad_norm": 0.02046089619398117, "kl": 0.0116729736328125, "learning_rate": 1.5293522112143371e-06, "loss": -0.0221, "num_tokens": 276577924.0, "reward": 0.40735191106796265, "reward_std": 0.0653359517455101, "rewards/gemini_judge_reward_func/mean": 0.1171875, "rewards/gemini_judge_reward_func/std": 0.21555359661579132, "rewards/semantic_correctness_reward_func/mean": 0.4438844621181488, "rewards/semantic_correctness_reward_func/std": 0.2117227166891098, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 163.1294708251953, "completions/mean_terminated_length": 147.47726440429688, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.26528957364175665, "grad_norm": 0.019794687628746033, "kl": 0.01212763786315918, "learning_rate": 1.517468706104589e-06, "loss": 0.0108, "num_tokens": 276940561.0, "reward": 0.42630788683891296, "reward_std": 0.061067983508110046, "rewards/gemini_judge_reward_func/mean": 0.1004464253783226, "rewards/gemini_judge_reward_func/std": 0.20355534553527832, "rewards/semantic_correctness_reward_func/mean": 0.4291464686393738, "rewards/semantic_correctness_reward_func/std": 0.20234155654907227, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 177.58929443359375, "completions/mean_terminated_length": 166.09954833984375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.26563100166446163, "grad_norm": 0.021563053131103516, "kl": 0.020466327667236328, "learning_rate": 1.505623285399121e-06, "loss": -0.0163, "num_tokens": 277263169.0, "reward": 0.45694440603256226, "reward_std": 0.06528093665838242, "rewards/gemini_judge_reward_func/mean": 0.1060267835855484, "rewards/gemini_judge_reward_func/std": 0.23360466957092285, "rewards/semantic_correctness_reward_func/mean": 0.3925253450870514, "rewards/semantic_correctness_reward_func/std": 0.21922528743743896, "rewards/xmlcount_reward_func/mean": 0.8400714993476868, "rewards/xmlcount_reward_func/std": 0.3684346675872803, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 186.77232360839844, "completions/mean_terminated_length": 147.64952087402344, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.26597242968716656, "grad_norm": 0.020064886659383774, "kl": 0.015275955200195312, "learning_rate": 1.4938160786375571e-06, "loss": -0.02, "num_tokens": 277623346.0, "reward": 0.44759249687194824, "reward_std": 0.07056890428066254, "rewards/gemini_judge_reward_func/mean": 0.1595982164144516, "rewards/gemini_judge_reward_func/std": 0.2799893319606781, "rewards/semantic_correctness_reward_func/mean": 0.4585872292518616, "rewards/semantic_correctness_reward_func/std": 0.2106294482946396, "rewards/xmlcount_reward_func/mean": 0.7300893068313599, "rewards/xmlcount_reward_func/std": 0.43690332770347595, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 153.2678680419922, "completions/mean_terminated_length": 145.42343139648438, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.26631385770987154, "grad_norm": 0.020851830020546913, "kl": 0.015101909637451172, "learning_rate": 1.4820472149416153e-06, "loss": 0.0062, "num_tokens": 277983922.0, "reward": 0.42797625064849854, "reward_std": 0.06007370352745056, "rewards/gemini_judge_reward_func/mean": 0.1651785671710968, "rewards/gemini_judge_reward_func/std": 0.2594698965549469, "rewards/semantic_correctness_reward_func/mean": 0.47787216305732727, "rewards/semantic_correctness_reward_func/std": 0.1910025030374527, "rewards/xmlcount_reward_func/mean": 0.6658259034156799, "rewards/xmlcount_reward_func/std": 0.4710857570171356, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 156.3794708251953, "completions/mean_terminated_length": 140.6045379638672, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2666552857325765, "grad_norm": 0.02058962918817997, "kl": 0.014112472534179688, "learning_rate": 1.4703168230137072e-06, "loss": -0.01, "num_tokens": 278356979.0, "reward": 0.39631539583206177, "reward_std": 0.06510846316814423, "rewards/gemini_judge_reward_func/mean": 0.109375, "rewards/gemini_judge_reward_func/std": 0.21381975710391998, "rewards/semantic_correctness_reward_func/mean": 0.4400768578052521, "rewards/semantic_correctness_reward_func/std": 0.1941695511341095, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 152.14732360839844, "completions/mean_terminated_length": 144.2928009033203, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.26699671375528145, "grad_norm": 0.02044842764735222, "kl": 0.013594627380371094, "learning_rate": 1.4586250311355132e-06, "loss": -0.0371, "num_tokens": 278714588.0, "reward": 0.44066599011421204, "reward_std": 0.055519502609968185, "rewards/gemini_judge_reward_func/mean": 0.1294642835855484, "rewards/gemini_judge_reward_func/std": 0.23433461785316467, "rewards/semantic_correctness_reward_func/mean": 0.4429011344909668, "rewards/semantic_correctness_reward_func/std": 0.19725266098976135, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 166.03125, "completions/mean_terminated_length": 146.4429168701172, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.2673381417779864, "grad_norm": 0.0210841353982687, "kl": 0.019263029098510742, "learning_rate": 1.4469719671666043e-06, "loss": -0.0098, "num_tokens": 279071023.0, "reward": 0.4465175271034241, "reward_std": 0.0766262337565422, "rewards/gemini_judge_reward_func/mean": 0.140625, "rewards/gemini_judge_reward_func/std": 0.24668484926223755, "rewards/semantic_correctness_reward_func/mean": 0.44983741641044617, "rewards/semantic_correctness_reward_func/std": 0.2102912813425064, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 149.70089721679688, "completions/mean_terminated_length": 149.70089721679688, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.2676795698006914, "grad_norm": 0.020219266414642334, "kl": 0.012418746948242188, "learning_rate": 1.4353577585430152e-06, "loss": -0.0121, "num_tokens": 279425912.0, "reward": 0.46571969985961914, "reward_std": 0.0656966120004654, "rewards/gemini_judge_reward_func/mean": 0.1138392835855484, "rewards/gemini_judge_reward_func/std": 0.20197565853595734, "rewards/semantic_correctness_reward_func/mean": 0.443071573972702, "rewards/semantic_correctness_reward_func/std": 0.1867143213748932, "rewards/xmlcount_reward_func/mean": 0.8289241790771484, "rewards/xmlcount_reward_func/std": 0.3740423023700714, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 163.125, "completions/mean_terminated_length": 159.26458740234375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.26802099782339633, "grad_norm": 0.023210877552628517, "kl": 0.014898300170898438, "learning_rate": 1.4237825322758735e-06, "loss": -0.0172, "num_tokens": 279769636.0, "reward": 0.4273902475833893, "reward_std": 0.04811634495854378, "rewards/gemini_judge_reward_func/mean": 0.0747767835855484, "rewards/gemini_judge_reward_func/std": 0.157900869846344, "rewards/semantic_correctness_reward_func/mean": 0.4377457797527313, "rewards/semantic_correctness_reward_func/std": 0.17589552700519562, "rewards/xmlcount_reward_func/mean": 0.774825930595398, "rewards/xmlcount_reward_func/std": 0.41559502482414246, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 167.7991180419922, "completions/mean_terminated_length": 156.17648315429688, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.2683624258461013, "grad_norm": 0.022142644971609116, "kl": 0.01622152328491211, "learning_rate": 1.412246414949997e-06, "loss": -0.0116, "num_tokens": 280144391.0, "reward": 0.41873544454574585, "reward_std": 0.05372651666402817, "rewards/gemini_judge_reward_func/mean": 0.1171875, "rewards/gemini_judge_reward_func/std": 0.22571587562561035, "rewards/semantic_correctness_reward_func/mean": 0.44717708230018616, "rewards/semantic_correctness_reward_func/std": 0.21336087584495544, "rewards/xmlcount_reward_func/mean": 0.7060624957084656, "rewards/xmlcount_reward_func/std": 0.4573599696159363, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 190.55357360839844, "completions/mean_terminated_length": 151.6074676513672, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.2687038538688063, "grad_norm": 0.027027791365981102, "kl": 0.01961684226989746, "learning_rate": 1.4007495327225162e-06, "loss": 0.0185, "num_tokens": 280532283.0, "reward": 0.40803104639053345, "reward_std": 0.07797716557979584, "rewards/gemini_judge_reward_func/mean": 0.1261160671710968, "rewards/gemini_judge_reward_func/std": 0.23675422370433807, "rewards/semantic_correctness_reward_func/mean": 0.42274436354637146, "rewards/semantic_correctness_reward_func/std": 0.20996291935443878, "rewards/xmlcount_reward_func/mean": 0.6825892329216003, "rewards/xmlcount_reward_func/std": 0.46256276965141296, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 165.3616180419922, "completions/mean_terminated_length": 149.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.2690452818915112, "grad_norm": 0.01970202848315239, "kl": 0.015131473541259766, "learning_rate": 1.389292011321498e-06, "loss": -0.026, "num_tokens": 280909580.0, "reward": 0.42601242661476135, "reward_std": 0.07185456156730652, "rewards/gemini_judge_reward_func/mean": 0.1395089328289032, "rewards/gemini_judge_reward_func/std": 0.26323994994163513, "rewards/semantic_correctness_reward_func/mean": 0.45679420232772827, "rewards/semantic_correctness_reward_func/std": 0.212924987077713, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 161.85714721679688, "completions/mean_terminated_length": 146.1818084716797, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.2693867099142162, "grad_norm": 0.022267106920480728, "kl": 0.013372421264648438, "learning_rate": 1.3778739760445552e-06, "loss": -0.0111, "num_tokens": 281254012.0, "reward": 0.4726380705833435, "reward_std": 0.0705428421497345, "rewards/gemini_judge_reward_func/mean": 0.1607142835855484, "rewards/gemini_judge_reward_func/std": 0.27581340074539185, "rewards/semantic_correctness_reward_func/mean": 0.4330117106437683, "rewards/semantic_correctness_reward_func/std": 0.22527293860912323, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 157.98214721679688, "completions/mean_terminated_length": 146.22625732421875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2697281379369212, "grad_norm": 0.021097326651215553, "kl": 0.015774250030517578, "learning_rate": 1.3664955517574967e-06, "loss": -0.0176, "num_tokens": 281637792.0, "reward": 0.4128992259502411, "reward_std": 0.06964334845542908, "rewards/gemini_judge_reward_func/mean": 0.1495535671710968, "rewards/gemini_judge_reward_func/std": 0.26249030232429504, "rewards/semantic_correctness_reward_func/mean": 0.4426388740539551, "rewards/semantic_correctness_reward_func/std": 0.21492232382297516, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 168.2366180419922, "completions/mean_terminated_length": 160.5270233154297, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.2700695659596261, "grad_norm": 0.018702613189816475, "kl": 0.011590957641601562, "learning_rate": 1.3551568628929434e-06, "loss": 0.0018, "num_tokens": 281995737.0, "reward": 0.4427972733974457, "reward_std": 0.05938927084207535, "rewards/gemini_judge_reward_func/mean": 0.0870535746216774, "rewards/gemini_judge_reward_func/std": 0.16811345517635345, "rewards/semantic_correctness_reward_func/mean": 0.42219167947769165, "rewards/semantic_correctness_reward_func/std": 0.18814805150032043, "rewards/xmlcount_reward_func/mean": 0.8088437914848328, "rewards/xmlcount_reward_func/std": 0.3951219618320465, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 167.6875, "completions/mean_terminated_length": 144.1192626953125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.2704109939823311, "grad_norm": 0.020110948011279106, "kl": 0.013335943222045898, "learning_rate": 1.343858033448982e-06, "loss": -0.037, "num_tokens": 282359479.0, "reward": 0.4621303975582123, "reward_std": 0.06999564170837402, "rewards/gemini_judge_reward_func/mean": 0.1517857164144516, "rewards/gemini_judge_reward_func/std": 0.27984848618507385, "rewards/semantic_correctness_reward_func/mean": 0.4340803325176239, "rewards/semantic_correctness_reward_func/std": 0.2148793637752533, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 144.5803680419922, "completions/mean_terminated_length": 144.5803680419922, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2707524220050361, "grad_norm": 0.020432932302355766, "kl": 0.01767253875732422, "learning_rate": 1.3325991869878013e-06, "loss": 0.0123, "num_tokens": 282724829.0, "reward": 0.4063700735569, "reward_std": 0.06738902628421783, "rewards/gemini_judge_reward_func/mean": 0.1149553582072258, "rewards/gemini_judge_reward_func/std": 0.24007098376750946, "rewards/semantic_correctness_reward_func/mean": 0.40768957138061523, "rewards/semantic_correctness_reward_func/std": 0.21261626482009888, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 149.48214721679688, "completions/mean_terminated_length": 145.560546875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.271093850027741, "grad_norm": 0.02059169113636017, "kl": 0.016257762908935547, "learning_rate": 1.321380446634342e-06, "loss": -0.0034, "num_tokens": 283067757.0, "reward": 0.42399582266807556, "reward_std": 0.06734077632427216, "rewards/gemini_judge_reward_func/mean": 0.1060267835855484, "rewards/gemini_judge_reward_func/std": 0.235991969704628, "rewards/semantic_correctness_reward_func/mean": 0.41760390996932983, "rewards/semantic_correctness_reward_func/std": 0.21390259265899658, "rewards/xmlcount_reward_func/mean": 0.7451607584953308, "rewards/xmlcount_reward_func/std": 0.4366372525691986, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0401785714285714, "completions/max_length": 1024.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 183.84376525878906, "completions/mean_terminated_length": 148.67442321777344, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.271435278050446, "grad_norm": 0.018847793340682983, "kl": 0.012856006622314453, "learning_rate": 1.3102019350749528e-06, "loss": -0.0197, "num_tokens": 283461510.0, "reward": 0.3889876902103424, "reward_std": 0.06528311222791672, "rewards/gemini_judge_reward_func/mean": 0.1305803507566452, "rewards/gemini_judge_reward_func/std": 0.2516138255596161, "rewards/semantic_correctness_reward_func/mean": 0.4201793968677521, "rewards/semantic_correctness_reward_func/std": 0.1820724755525589, "rewards/xmlcount_reward_func/mean": 0.6317991614341736, "rewards/xmlcount_reward_func/std": 0.48062554001808167, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 156.80804443359375, "completions/mean_terminated_length": 148.99549865722656, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.27177670607315096, "grad_norm": 0.020621461793780327, "kl": 0.012531042098999023, "learning_rate": 1.2990637745560418e-06, "loss": 0.0117, "num_tokens": 283817627.0, "reward": 0.46992671489715576, "reward_std": 0.08015181124210358, "rewards/gemini_judge_reward_func/mean": 0.1517857164144516, "rewards/gemini_judge_reward_func/std": 0.27068495750427246, "rewards/semantic_correctness_reward_func/mean": 0.455187052488327, "rewards/semantic_correctness_reward_func/std": 0.20298680663108826, "rewards/xmlcount_reward_func/mean": 0.7954375147819519, "rewards/xmlcount_reward_func/std": 0.405271977186203, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 148.75894165039062, "completions/mean_terminated_length": 144.83409118652344, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2721181340958559, "grad_norm": 0.020870037376880646, "kl": 0.014215469360351562, "learning_rate": 1.2879660868827508e-06, "loss": -0.03, "num_tokens": 284171409.0, "reward": 0.40534916520118713, "reward_std": 0.051454123109579086, "rewards/gemini_judge_reward_func/mean": 0.1104910746216774, "rewards/gemini_judge_reward_func/std": 0.23151718080043793, "rewards/semantic_correctness_reward_func/mean": 0.41151338815689087, "rewards/semantic_correctness_reward_func/std": 0.21736524999141693, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 172.7991180419922, "completions/mean_terminated_length": 145.3410186767578, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.27245956211856087, "grad_norm": 0.021433832123875618, "kl": 0.014792680740356445, "learning_rate": 1.2769089934176126e-06, "loss": 0.0023, "num_tokens": 284538724.0, "reward": 0.4125317931175232, "reward_std": 0.06591948866844177, "rewards/gemini_judge_reward_func/mean": 0.1216517835855484, "rewards/gemini_judge_reward_func/std": 0.23435331881046295, "rewards/semantic_correctness_reward_func/mean": 0.4251053035259247, "rewards/semantic_correctness_reward_func/std": 0.19413845241069794, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 165.05804443359375, "completions/mean_terminated_length": 153.398193359375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.27280099014126585, "grad_norm": 0.02007019706070423, "kl": 0.0131683349609375, "learning_rate": 1.2658926150792321e-06, "loss": 0.0247, "num_tokens": 284890389.0, "reward": 0.43971216678619385, "reward_std": 0.06779628247022629, "rewards/gemini_judge_reward_func/mean": 0.1238839253783226, "rewards/gemini_judge_reward_func/std": 0.2615041136741638, "rewards/semantic_correctness_reward_func/mean": 0.4046053886413574, "rewards/semantic_correctness_reward_func/std": 0.21147848665714264, "rewards/xmlcount_reward_func/mean": 0.7730938196182251, "rewards/xmlcount_reward_func/std": 0.4206935167312622, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 174.90626525878906, "completions/mean_terminated_length": 155.52053833007812, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.27314241816397083, "grad_norm": 0.018950890749692917, "kl": 0.012112617492675781, "learning_rate": 1.2549170723409548e-06, "loss": 0.031, "num_tokens": 285250060.0, "reward": 0.44825732707977295, "reward_std": 0.07844427973031998, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.21174997091293335, "rewards/semantic_correctness_reward_func/mean": 0.43173304200172424, "rewards/semantic_correctness_reward_func/std": 0.20918342471122742, "rewards/xmlcount_reward_func/mean": 0.7775446772575378, "rewards/xmlcount_reward_func/std": 0.4613456428050995, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 174.5178680419922, "completions/mean_terminated_length": 147.11520385742188, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.27348384618667576, "grad_norm": 0.01994583196938038, "kl": 0.01393747329711914, "learning_rate": 1.243982485229559e-06, "loss": -0.0056, "num_tokens": 285638904.0, "reward": 0.42606374621391296, "reward_std": 0.06385260820388794, "rewards/gemini_judge_reward_func/mean": 0.1104910746216774, "rewards/gemini_judge_reward_func/std": 0.2512555420398712, "rewards/semantic_correctness_reward_func/mean": 0.41007763147354126, "rewards/semantic_correctness_reward_func/std": 0.207870215177536, "rewards/xmlcount_reward_func/mean": 0.749629557132721, "rewards/xmlcount_reward_func/std": 0.434091717004776, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 159.65179443359375, "completions/mean_terminated_length": 147.91856384277344, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.27382527420938074, "grad_norm": 0.019207848235964775, "kl": 0.013546228408813477, "learning_rate": 1.233088973323937e-06, "loss": -0.026, "num_tokens": 285992778.0, "reward": 0.45018380880355835, "reward_std": 0.06126423552632332, "rewards/gemini_judge_reward_func/mean": 0.1227678582072258, "rewards/gemini_judge_reward_func/std": 0.24027155339717865, "rewards/semantic_correctness_reward_func/mean": 0.46813318133354187, "rewards/semantic_correctness_reward_func/std": 0.20166510343551636, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 179.36607360839844, "completions/mean_terminated_length": 152.11981201171875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.2741667022320857, "grad_norm": 0.019164903089404106, "kl": 0.011428117752075195, "learning_rate": 1.2222366557537911e-06, "loss": -0.0085, "num_tokens": 286324600.0, "reward": 0.4557950496673584, "reward_std": 0.06462844461202621, "rewards/gemini_judge_reward_func/mean": 0.1506696492433548, "rewards/gemini_judge_reward_func/std": 0.27591997385025024, "rewards/semantic_correctness_reward_func/mean": 0.4761357307434082, "rewards/semantic_correctness_reward_func/std": 0.2194724828004837, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 163.40179443359375, "completions/mean_terminated_length": 159.5426025390625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.27450813025479065, "grad_norm": 0.01984548196196556, "kl": 0.014154434204101562, "learning_rate": 1.2114256511983274e-06, "loss": 0.0074, "num_tokens": 286673866.0, "reward": 0.46019554138183594, "reward_std": 0.07883590459823608, "rewards/gemini_judge_reward_func/mean": 0.1529017835855484, "rewards/gemini_judge_reward_func/std": 0.2797389626502991, "rewards/semantic_correctness_reward_func/mean": 0.45792388916015625, "rewards/semantic_correctness_reward_func/std": 0.2234930694103241, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 183.72769165039062, "completions/mean_terminated_length": 156.6221160888672, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.27484955827749563, "grad_norm": 0.019319184124469757, "kl": 0.013891935348510742, "learning_rate": 1.200656077884958e-06, "loss": 0.0075, "num_tokens": 287028617.0, "reward": 0.4133339822292328, "reward_std": 0.0628940612077713, "rewards/gemini_judge_reward_func/mean": 0.1216517835855484, "rewards/gemini_judge_reward_func/std": 0.2571618854999542, "rewards/semantic_correctness_reward_func/mean": 0.45144638419151306, "rewards/semantic_correctness_reward_func/std": 0.19957341253757477, "rewards/xmlcount_reward_func/mean": 0.6859598159790039, "rewards/xmlcount_reward_func/std": 0.46228134632110596, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 145.25894165039062, "completions/mean_terminated_length": 141.31838989257812, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2751909863002006, "grad_norm": 0.020531706511974335, "kl": 0.013467550277709961, "learning_rate": 1.189928053588012e-06, "loss": 0.0089, "num_tokens": 287368943.0, "reward": 0.4259699881076813, "reward_std": 0.056370336562395096, "rewards/gemini_judge_reward_func/mean": 0.1160714253783226, "rewards/gemini_judge_reward_func/std": 0.23540008068084717, "rewards/semantic_correctness_reward_func/mean": 0.3827962875366211, "rewards/semantic_correctness_reward_func/std": 0.19811107218265533, "rewards/xmlcount_reward_func/mean": 0.7574554085731506, "rewards/xmlcount_reward_func/std": 0.4265342950820923, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 168.04019165039062, "completions/mean_terminated_length": 160.32882690429688, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.27553241432290554, "grad_norm": 0.020059313625097275, "kl": 0.012961864471435547, "learning_rate": 1.1792416956274443e-06, "loss": -0.0077, "num_tokens": 287728964.0, "reward": 0.4849930703639984, "reward_std": 0.08212324976921082, "rewards/gemini_judge_reward_func/mean": 0.1640625, "rewards/gemini_judge_reward_func/std": 0.27331385016441345, "rewards/semantic_correctness_reward_func/mean": 0.4470454156398773, "rewards/semantic_correctness_reward_func/std": 0.22025921940803528, "rewards/xmlcount_reward_func/mean": 0.8248973488807678, "rewards/xmlcount_reward_func/std": 0.3805646598339081, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 168.46429443359375, "completions/mean_terminated_length": 144.91741943359375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2758738423456105, "grad_norm": 0.020307661965489388, "kl": 0.01799488067626953, "learning_rate": 1.1685971208675539e-06, "loss": 0.0114, "num_tokens": 288099604.0, "reward": 0.4186796247959137, "reward_std": 0.0665237084031105, "rewards/gemini_judge_reward_func/mean": 0.1629464328289032, "rewards/gemini_judge_reward_func/std": 0.30907392501831055, "rewards/semantic_correctness_reward_func/mean": 0.46446940302848816, "rewards/semantic_correctness_reward_func/std": 0.24958083033561707, "rewards/xmlcount_reward_func/mean": 0.6515178680419922, "rewards/xmlcount_reward_func/std": 0.4737243354320526, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 156.15625, "completions/mean_terminated_length": 144.3755645751953, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.2762152703683155, "grad_norm": 0.020918505266308784, "kl": 0.014392375946044922, "learning_rate": 1.157994445715706e-06, "loss": 0.0052, "num_tokens": 288450511.0, "reward": 0.42883527278900146, "reward_std": 0.07986614853143692, "rewards/gemini_judge_reward_func/mean": 0.1305803507566452, "rewards/gemini_judge_reward_func/std": 0.235503688454628, "rewards/semantic_correctness_reward_func/mean": 0.45301559567451477, "rewards/semantic_correctness_reward_func/std": 0.2121279239654541, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 154.13394165039062, "completions/mean_terminated_length": 150.23318481445312, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.27655669839102043, "grad_norm": 0.0214835312217474, "kl": 0.012638568878173828, "learning_rate": 1.1474337861210543e-06, "loss": -0.03, "num_tokens": 288758333.0, "reward": 0.4833209812641144, "reward_std": 0.06771397590637207, "rewards/gemini_judge_reward_func/mean": 0.1651785671710968, "rewards/gemini_judge_reward_func/std": 0.277225136756897, "rewards/semantic_correctness_reward_func/mean": 0.450685054063797, "rewards/semantic_correctness_reward_func/std": 0.23370078206062317, "rewards/xmlcount_reward_func/mean": 0.8177813291549683, "rewards/xmlcount_reward_func/std": 0.3879494369029999, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 149.8928680419922, "completions/mean_terminated_length": 142.0180206298828, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2768981264137254, "grad_norm": 0.020677506923675537, "kl": 0.018234729766845703, "learning_rate": 1.1369152575732823e-06, "loss": -0.0149, "num_tokens": 289114269.0, "reward": 0.4415244460105896, "reward_std": 0.07626640051603317, "rewards/gemini_judge_reward_func/mean": 0.1651785671710968, "rewards/gemini_judge_reward_func/std": 0.26901575922966003, "rewards/semantic_correctness_reward_func/mean": 0.4562382698059082, "rewards/semantic_correctness_reward_func/std": 0.20611201226711273, "rewards/xmlcount_reward_func/mean": 0.7105134725570679, "rewards/xmlcount_reward_func/std": 0.45530465245246887, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 163.88394165039062, "completions/mean_terminated_length": 144.24656677246094, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2772395544364304, "grad_norm": 0.02087881974875927, "kl": 0.014058828353881836, "learning_rate": 1.1264389751013326e-06, "loss": -0.0299, "num_tokens": 289461251.0, "reward": 0.4769635498523712, "reward_std": 0.0741821750998497, "rewards/gemini_judge_reward_func/mean": 0.1696428507566452, "rewards/gemini_judge_reward_func/std": 0.2775498926639557, "rewards/semantic_correctness_reward_func/mean": 0.472531795501709, "rewards/semantic_correctness_reward_func/std": 0.21735930442810059, "rewards/xmlcount_reward_func/mean": 0.7864999771118164, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 165.35269165039062, "completions/mean_terminated_length": 161.5022430419922, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2775809824591353, "grad_norm": 0.02140544354915619, "kl": 0.015886783599853516, "learning_rate": 1.1160050532721527e-06, "loss": 0.0098, "num_tokens": 289816638.0, "reward": 0.437183141708374, "reward_std": 0.06502541899681091, "rewards/gemini_judge_reward_func/mean": 0.1328125, "rewards/gemini_judge_reward_func/std": 0.2581525146961212, "rewards/semantic_correctness_reward_func/mean": 0.4504155218601227, "rewards/semantic_correctness_reward_func/std": 0.20866143703460693, "rewards/xmlcount_reward_func/mean": 0.7349375486373901, "rewards/xmlcount_reward_func/std": 0.4362303912639618, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 174.36607360839844, "completions/mean_terminated_length": 158.91818237304688, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.2779224104818403, "grad_norm": 0.019738927483558655, "kl": 0.014048099517822266, "learning_rate": 1.1056136061894386e-06, "loss": -0.0082, "num_tokens": 290180528.0, "reward": 0.41765350103378296, "reward_std": 0.04742603749036789, "rewards/gemini_judge_reward_func/mean": 0.0970982164144516, "rewards/gemini_judge_reward_func/std": 0.21256086230278015, "rewards/semantic_correctness_reward_func/mean": 0.4283210337162018, "rewards/semantic_correctness_reward_func/std": 0.20828229188919067, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 158.6875, "completions/mean_terminated_length": 142.9545440673828, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2782638385045453, "grad_norm": 0.021320754662156105, "kl": 0.019411802291870117, "learning_rate": 1.095264747492391e-06, "loss": -0.0259, "num_tokens": 290586834.0, "reward": 0.38686153292655945, "reward_std": 0.07450807094573975, "rewards/gemini_judge_reward_func/mean": 0.1540178507566452, "rewards/gemini_judge_reward_func/std": 0.2533310353755951, "rewards/semantic_correctness_reward_func/mean": 0.4465217590332031, "rewards/semantic_correctness_reward_func/std": 0.1979120969772339, "rewards/xmlcount_reward_func/mean": 0.5898750424385071, "rewards/xmlcount_reward_func/std": 0.4935583770275116, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 187.71429443359375, "completions/mean_terminated_length": 156.74073791503906, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.2786052665272502, "grad_norm": 0.01968975178897381, "kl": 0.01842355728149414, "learning_rate": 1.0849585903544707e-06, "loss": 0.0035, "num_tokens": 290962134.0, "reward": 0.3895324170589447, "reward_std": 0.06561978906393051, "rewards/gemini_judge_reward_func/mean": 0.1261160671710968, "rewards/gemini_judge_reward_func/std": 0.23437467217445374, "rewards/semantic_correctness_reward_func/mean": 0.4061976969242096, "rewards/semantic_correctness_reward_func/std": 0.18816907703876495, "rewards/xmlcount_reward_func/mean": 0.6446161270141602, "rewards/xmlcount_reward_func/std": 0.4794987738132477, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 168.25, "completions/mean_terminated_length": 148.7123260498047, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2789466945499552, "grad_norm": 0.02080092765390873, "kl": 0.013473033905029297, "learning_rate": 1.0746952474821615e-06, "loss": -0.0304, "num_tokens": 291327398.0, "reward": 0.43678271770477295, "reward_std": 0.07601462304592133, "rewards/gemini_judge_reward_func/mean": 0.1473214328289032, "rewards/gemini_judge_reward_func/std": 0.25067970156669617, "rewards/semantic_correctness_reward_func/mean": 0.4235205352306366, "rewards/semantic_correctness_reward_func/std": 0.24099822342395782, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 169.82144165039062, "completions/mean_terminated_length": 158.22625732421875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.27928812257266017, "grad_norm": 0.020462460815906525, "kl": 0.014742612838745117, "learning_rate": 1.0644748311137377e-06, "loss": -0.0084, "num_tokens": 291660130.0, "reward": 0.46902474761009216, "reward_std": 0.06322862952947617, "rewards/gemini_judge_reward_func/mean": 0.1729910671710968, "rewards/gemini_judge_reward_func/std": 0.29747891426086426, "rewards/semantic_correctness_reward_func/mean": 0.4574181139469147, "rewards/semantic_correctness_reward_func/std": 0.2231023907661438, "rewards/xmlcount_reward_func/mean": 0.7708616852760315, "rewards/xmlcount_reward_func/std": 0.42080527544021606, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 170.60269165039062, "completions/mean_terminated_length": 155.08636474609375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2796295505953651, "grad_norm": 0.01898844726383686, "kl": 0.011361122131347656, "learning_rate": 1.0542974530180327e-06, "loss": -0.0136, "num_tokens": 291994705.0, "reward": 0.45434191823005676, "reward_std": 0.07022686302661896, "rewards/gemini_judge_reward_func/mean": 0.1573660671710968, "rewards/gemini_judge_reward_func/std": 0.2812555730342865, "rewards/semantic_correctness_reward_func/mean": 0.45547717809677124, "rewards/semantic_correctness_reward_func/std": 0.22370651364326477, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 162.15625, "completions/mean_terminated_length": 154.3918914794922, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.2799709786180701, "grad_norm": 0.01947224698960781, "kl": 0.011575698852539062, "learning_rate": 1.0441632244932238e-06, "loss": -0.0028, "num_tokens": 292289820.0, "reward": 0.4589068293571472, "reward_std": 0.05952470749616623, "rewards/gemini_judge_reward_func/mean": 0.09375, "rewards/gemini_judge_reward_func/std": 0.2160203754901886, "rewards/semantic_correctness_reward_func/mean": 0.3910340368747711, "rewards/semantic_correctness_reward_func/std": 0.21046021580696106, "rewards/xmlcount_reward_func/mean": 0.8579999804496765, "rewards/xmlcount_reward_func/std": 0.35106155276298523, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 156.95982360839844, "completions/mean_terminated_length": 137.1643829345703, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.28031240664077506, "grad_norm": 0.02017991617321968, "kl": 0.01740121841430664, "learning_rate": 1.0340722563656109e-06, "loss": -0.0061, "num_tokens": 292632279.0, "reward": 0.4119788706302643, "reward_std": 0.05955352261662483, "rewards/gemini_judge_reward_func/mean": 0.1149553582072258, "rewards/gemini_judge_reward_func/std": 0.24354811012744904, "rewards/semantic_correctness_reward_func/mean": 0.4357335567474365, "rewards/semantic_correctness_reward_func/std": 0.20957152545452118, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 153.30357360839844, "completions/mean_terminated_length": 145.45945739746094, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.28065383466348, "grad_norm": 0.020473873242735863, "kl": 0.013295650482177734, "learning_rate": 1.0240246589884046e-06, "loss": -0.0091, "num_tokens": 292982703.0, "reward": 0.44188037514686584, "reward_std": 0.04769199714064598, "rewards/gemini_judge_reward_func/mean": 0.1171875, "rewards/gemini_judge_reward_func/std": 0.22941070795059204, "rewards/semantic_correctness_reward_func/mean": 0.4377768635749817, "rewards/semantic_correctness_reward_func/std": 0.21695400774478912, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 164.54464721679688, "completions/mean_terminated_length": 148.91818237304688, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.28099526268618497, "grad_norm": 0.019360825419425964, "kl": 0.012386083602905273, "learning_rate": 1.0140205422405213e-06, "loss": 0.0227, "num_tokens": 293337929.0, "reward": 0.4652153551578522, "reward_std": 0.07463736832141876, "rewards/gemini_judge_reward_func/mean": 0.1953125, "rewards/gemini_judge_reward_func/std": 0.31242993474006653, "rewards/semantic_correctness_reward_func/mean": 0.46970170736312866, "rewards/semantic_correctness_reward_func/std": 0.24221506714820862, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 144.9866180419922, "completions/mean_terminated_length": 144.9866180419922, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.28133669070888995, "grad_norm": 0.021534979343414307, "kl": 0.013988494873046875, "learning_rate": 1.0040600155253766e-06, "loss": 0.0079, "num_tokens": 293665650.0, "reward": 0.4723190665245056, "reward_std": 0.07654394209384918, "rewards/gemini_judge_reward_func/mean": 0.1651785671710968, "rewards/gemini_judge_reward_func/std": 0.27519577741622925, "rewards/semantic_correctness_reward_func/mean": 0.45823797583580017, "rewards/semantic_correctness_reward_func/std": 0.22768716514110565, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 161.45089721679688, "completions/mean_terminated_length": 161.45089721679688, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2816781187315949, "grad_norm": 0.01974865049123764, "kl": 0.013636112213134766, "learning_rate": 9.941431877696955e-07, "loss": -0.0092, "num_tokens": 293982823.0, "reward": 0.44613802433013916, "reward_std": 0.0767451748251915, "rewards/gemini_judge_reward_func/mean": 0.1160714253783226, "rewards/gemini_judge_reward_func/std": 0.23058846592903137, "rewards/semantic_correctness_reward_func/mean": 0.4361810088157654, "rewards/semantic_correctness_reward_func/std": 0.19925597310066223, "rewards/xmlcount_reward_func/mean": 0.7811830639839172, "rewards/xmlcount_reward_func/std": 0.41245442628860474, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 171.91964721679688, "completions/mean_terminated_length": 148.46788024902344, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.28201954675429985, "grad_norm": 0.019685156643390656, "kl": 0.013269186019897461, "learning_rate": 9.842701674223187e-07, "loss": -0.0294, "num_tokens": 294355533.0, "reward": 0.41046252846717834, "reward_std": 0.05996851623058319, "rewards/gemini_judge_reward_func/mean": 0.1171875, "rewards/gemini_judge_reward_func/std": 0.2354399412870407, "rewards/semantic_correctness_reward_func/mean": 0.41921430826187134, "rewards/semantic_correctness_reward_func/std": 0.21170702576637268, "rewards/xmlcount_reward_func/mean": 0.6993616819381714, "rewards/xmlcount_reward_func/std": 0.4591045379638672, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 154.86607360839844, "completions/mean_terminated_length": 147.03604125976562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.28236097477700484, "grad_norm": 0.020932121202349663, "kl": 0.015183448791503906, "learning_rate": 9.744410624530148e-07, "loss": -0.0218, "num_tokens": 294719891.0, "reward": 0.4267961084842682, "reward_std": 0.07028646767139435, "rewards/gemini_judge_reward_func/mean": 0.140625, "rewards/gemini_judge_reward_func/std": 0.2523016333580017, "rewards/semantic_correctness_reward_func/mean": 0.45848026871681213, "rewards/semantic_correctness_reward_func/std": 0.20311670005321503, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 168.8928680419922, "completions/mean_terminated_length": 145.3577880859375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.28270240279970976, "grad_norm": 0.020107265561819077, "kl": 0.013638973236083984, "learning_rate": 9.646559803512995e-07, "loss": 0.0013, "num_tokens": 295080995.0, "reward": 0.44543105363845825, "reward_std": 0.06815174221992493, "rewards/gemini_judge_reward_func/mean": 0.1428571492433548, "rewards/gemini_judge_reward_func/std": 0.2476668804883957, "rewards/semantic_correctness_reward_func/mean": 0.43994078040122986, "rewards/semantic_correctness_reward_func/std": 0.21600624918937683, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 169.49107360839844, "completions/mean_terminated_length": 153.9545440673828, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.28304383082241474, "grad_norm": 0.019049208611249924, "kl": 0.015591621398925781, "learning_rate": 9.549150281252633e-07, "loss": 0.0164, "num_tokens": 295424597.0, "reward": 0.45486655831336975, "reward_std": 0.08084883540868759, "rewards/gemini_judge_reward_func/mean": 0.1450892835855484, "rewards/gemini_judge_reward_func/std": 0.2607302665710449, "rewards/semantic_correctness_reward_func/mean": 0.44690415263175964, "rewards/semantic_correctness_reward_func/std": 0.22606715559959412, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 173.16964721679688, "completions/mean_terminated_length": 157.6999969482422, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.2833852588451197, "grad_norm": 0.01913963072001934, "kl": 0.015146732330322266, "learning_rate": 9.452183123003999e-07, "loss": 0.0101, "num_tokens": 295787895.0, "reward": 0.45005136728286743, "reward_std": 0.06961911916732788, "rewards/gemini_judge_reward_func/mean": 0.1752232164144516, "rewards/gemini_judge_reward_func/std": 0.30639660358428955, "rewards/semantic_correctness_reward_func/mean": 0.43075647950172424, "rewards/semantic_correctness_reward_func/std": 0.2312474101781845, "rewards/xmlcount_reward_func/mean": 0.7345268130302429, "rewards/xmlcount_reward_func/std": 0.4422244727611542, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 183.26339721679688, "completions/mean_terminated_length": 156.14285278320312, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.28372668686782465, "grad_norm": 0.01923030987381935, "kl": 0.011376142501831055, "learning_rate": 9.355659389184396e-07, "loss": -0.0029, "num_tokens": 296141650.0, "reward": 0.4269668757915497, "reward_std": 0.06150417774915695, "rewards/gemini_judge_reward_func/mean": 0.1060267835855484, "rewards/gemini_judge_reward_func/std": 0.20415377616882324, "rewards/semantic_correctness_reward_func/mean": 0.4212806522846222, "rewards/semantic_correctness_reward_func/std": 0.19276918470859528, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 171.87054443359375, "completions/mean_terminated_length": 152.41551208496094, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.28406811489052963, "grad_norm": 0.020839476957917213, "kl": 0.012931585311889648, "learning_rate": 9.259580135361929e-07, "loss": 0.0077, "num_tokens": 296490013.0, "reward": 0.4934826195240021, "reward_std": 0.07880446314811707, "rewards/gemini_judge_reward_func/mean": 0.1395089328289032, "rewards/gemini_judge_reward_func/std": 0.25567853450775146, "rewards/semantic_correctness_reward_func/mean": 0.44558241963386536, "rewards/semantic_correctness_reward_func/std": 0.19386854767799377, "rewards/xmlcount_reward_func/mean": 0.8714062571525574, "rewards/xmlcount_reward_func/std": 0.3368014693260193, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 142.20089721679688, "completions/mean_terminated_length": 138.24664306640625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.2844095429132346, "grad_norm": 0.021701615303754807, "kl": 0.01783442497253418, "learning_rate": 9.163946412243896e-07, "loss": -0.0085, "num_tokens": 296849162.0, "reward": 0.4567524492740631, "reward_std": 0.08476989716291428, "rewards/gemini_judge_reward_func/mean": 0.1618303507566452, "rewards/gemini_judge_reward_func/std": 0.29147952795028687, "rewards/semantic_correctness_reward_func/mean": 0.4586014151573181, "rewards/semantic_correctness_reward_func/std": 0.21884453296661377, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 161.02232360839844, "completions/mean_terminated_length": 153.2477569580078, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.28475097093593954, "grad_norm": 0.02000286616384983, "kl": 0.015344858169555664, "learning_rate": 9.068759265665384e-07, "loss": 0.0039, "num_tokens": 297207363.0, "reward": 0.45448118448257446, "reward_std": 0.05085495859384537, "rewards/gemini_judge_reward_func/mean": 0.0770089253783226, "rewards/gemini_judge_reward_func/std": 0.18472008407115936, "rewards/semantic_correctness_reward_func/mean": 0.4046199917793274, "rewards/semantic_correctness_reward_func/std": 0.20661711692810059, "rewards/xmlcount_reward_func/mean": 0.8568840026855469, "rewards/xmlcount_reward_func/std": 0.3510022759437561, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 164.46429443359375, "completions/mean_terminated_length": 148.83636474609375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.2850923989586445, "grad_norm": 0.020461585372686386, "kl": 0.013503074645996094, "learning_rate": 8.974019736577777e-07, "loss": -0.0198, "num_tokens": 297569715.0, "reward": 0.4008234739303589, "reward_std": 0.0664874017238617, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.21303418278694153, "rewards/semantic_correctness_reward_func/mean": 0.4402957856655121, "rewards/semantic_correctness_reward_func/std": 0.21150822937488556, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 163.07589721679688, "completions/mean_terminated_length": 143.4200897216797, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2854338269813495, "grad_norm": 0.023171184584498405, "kl": 0.014976263046264648, "learning_rate": 8.879728861037385e-07, "loss": -0.012, "num_tokens": 297925140.0, "reward": 0.3934459984302521, "reward_std": 0.07260891795158386, "rewards/gemini_judge_reward_func/mean": 0.1116071417927742, "rewards/gemini_judge_reward_func/std": 0.2242058366537094, "rewards/semantic_correctness_reward_func/mean": 0.4212654232978821, "rewards/semantic_correctness_reward_func/std": 0.19929270446300507, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 174.25894165039062, "completions/mean_terminated_length": 158.80908203125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.2857752550040545, "grad_norm": 0.01915401965379715, "kl": 0.014138221740722656, "learning_rate": 8.785887670194137e-07, "loss": 0.0121, "num_tokens": 298242194.0, "reward": 0.4870355427265167, "reward_std": 0.07406332343816757, "rewards/gemini_judge_reward_func/mean": 0.1439732164144516, "rewards/gemini_judge_reward_func/std": 0.25647982954978943, "rewards/semantic_correctness_reward_func/mean": 0.4312308430671692, "rewards/semantic_correctness_reward_func/std": 0.2138284593820572, "rewards/xmlcount_reward_func/mean": 0.8580000996589661, "rewards/xmlcount_reward_func/std": 0.35106155276298523, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 149.5, "completions/mean_terminated_length": 149.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2861166830267594, "grad_norm": 0.020606782287359238, "kl": 0.014037609100341797, "learning_rate": 8.692497190280225e-07, "loss": -0.0364, "num_tokens": 298575282.0, "reward": 0.42692282795906067, "reward_std": 0.0603804774582386, "rewards/gemini_judge_reward_func/mean": 0.0904017835855484, "rewards/gemini_judge_reward_func/std": 0.2089034467935562, "rewards/semantic_correctness_reward_func/mean": 0.3808102607727051, "rewards/semantic_correctness_reward_func/std": 0.22411176562309265, "rewards/xmlcount_reward_func/mean": 0.786500096321106, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 163.34375, "completions/mean_terminated_length": 155.590087890625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2864581110494644, "grad_norm": 0.020060038194060326, "kl": 0.01865839958190918, "learning_rate": 8.599558442598998e-07, "loss": 0.0025, "num_tokens": 298949891.0, "reward": 0.41716110706329346, "reward_std": 0.06800390779972076, "rewards/gemini_judge_reward_func/mean": 0.1462053507566452, "rewards/gemini_judge_reward_func/std": 0.25848188996315, "rewards/semantic_correctness_reward_func/mean": 0.4352964758872986, "rewards/semantic_correctness_reward_func/std": 0.2010316401720047, "rewards/xmlcount_reward_func/mean": 0.6790491342544556, "rewards/xmlcount_reward_func/std": 0.46593108773231506, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 163.66964721679688, "completions/mean_terminated_length": 148.0272674560547, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2867995390721694, "grad_norm": 0.023369010537862778, "kl": 0.015680789947509766, "learning_rate": 8.507072443513703e-07, "loss": -0.0068, "num_tokens": 299298193.0, "reward": 0.4394191801548004, "reward_std": 0.048864465206861496, "rewards/gemini_judge_reward_func/mean": 0.1071428582072258, "rewards/gemini_judge_reward_func/std": 0.23128196597099304, "rewards/semantic_correctness_reward_func/mean": 0.4455600380897522, "rewards/semantic_correctness_reward_func/std": 0.21113541722297668, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 164.74107360839844, "completions/mean_terminated_length": 160.8878936767578, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2871409670948743, "grad_norm": 0.020501412451267242, "kl": 0.013971805572509766, "learning_rate": 8.415040204436426e-07, "loss": 0.0172, "num_tokens": 299640531.0, "reward": 0.44586053490638733, "reward_std": 0.059171684086322784, "rewards/gemini_judge_reward_func/mean": 0.0993303582072258, "rewards/gemini_judge_reward_func/std": 0.2047901600599289, "rewards/semantic_correctness_reward_func/mean": 0.4397847056388855, "rewards/semantic_correctness_reward_func/std": 0.20271818339824677, "rewards/xmlcount_reward_func/mean": 0.7954286336898804, "rewards/xmlcount_reward_func/std": 0.3996967077255249, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 164.39732360839844, "completions/mean_terminated_length": 156.6531524658203, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.2874823951175793, "grad_norm": 0.020911818370223045, "kl": 0.011243343353271484, "learning_rate": 8.323462731816962e-07, "loss": -0.0332, "num_tokens": 299943028.0, "reward": 0.46584782004356384, "reward_std": 0.06846728920936584, "rewards/gemini_judge_reward_func/mean": 0.1183035746216774, "rewards/gemini_judge_reward_func/std": 0.24595338106155396, "rewards/semantic_correctness_reward_func/mean": 0.41238173842430115, "rewards/semantic_correctness_reward_func/std": 0.2328694760799408, "rewards/xmlcount_reward_func/mean": 0.8401250839233398, "rewards/xmlcount_reward_func/std": 0.3684578835964203, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 163.625, "completions/mean_terminated_length": 151.94570922851562, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.28782382314028426, "grad_norm": 0.021682532504200935, "kl": 0.015304327011108398, "learning_rate": 8.232341027131885e-07, "loss": -0.0226, "num_tokens": 300254040.0, "reward": 0.46195971965789795, "reward_std": 0.049539245665073395, "rewards/gemini_judge_reward_func/mean": 0.1071428582072258, "rewards/gemini_judge_reward_func/std": 0.1915077120065689, "rewards/semantic_correctness_reward_func/mean": 0.4152628481388092, "rewards/semantic_correctness_reward_func/std": 0.20335595309734344, "rewards/xmlcount_reward_func/mean": 0.8401250839233398, "rewards/xmlcount_reward_func/std": 0.3684578835964203, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 183.6607208251953, "completions/mean_terminated_length": 160.5321044921875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.2881652511629892, "grad_norm": 0.019986923784017563, "kl": 0.01100611686706543, "learning_rate": 8.141676086873574e-07, "loss": -0.0132, "num_tokens": 300626396.0, "reward": 0.43232661485671997, "reward_std": 0.058139532804489136, "rewards/gemini_judge_reward_func/mean": 0.0982142835855484, "rewards/gemini_judge_reward_func/std": 0.19909308850765228, "rewards/semantic_correctness_reward_func/mean": 0.3922043442726135, "rewards/semantic_correctness_reward_func/std": 0.19739094376564026, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 157.40625, "completions/mean_terminated_length": 141.64999389648438, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.28850667918569417, "grad_norm": 0.02015194483101368, "kl": 0.016405105590820312, "learning_rate": 8.051468902539272e-07, "loss": -0.0271, "num_tokens": 300980835.0, "reward": 0.4009341299533844, "reward_std": 0.06801611930131912, "rewards/gemini_judge_reward_func/mean": 0.1428571492433548, "rewards/gemini_judge_reward_func/std": 0.2476668804883957, "rewards/semantic_correctness_reward_func/mean": 0.4286705553531647, "rewards/semantic_correctness_reward_func/std": 0.22438682615756989, "rewards/xmlcount_reward_func/mean": 0.6451429128646851, "rewards/xmlcount_reward_func/std": 0.4791279733181, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 166.5982208251953, "completions/mean_terminated_length": 154.95928955078125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.28884810720839915, "grad_norm": 0.01968112215399742, "kl": 0.013155221939086914, "learning_rate": 7.961720460620321e-07, "loss": -0.008, "num_tokens": 301317749.0, "reward": 0.47892674803733826, "reward_std": 0.06558345258235931, "rewards/gemini_judge_reward_func/mean": 0.1395089328289032, "rewards/gemini_judge_reward_func/std": 0.2433219850063324, "rewards/semantic_correctness_reward_func/mean": 0.47111573815345764, "rewards/semantic_correctness_reward_func/std": 0.19663625955581665, "rewards/xmlcount_reward_func/mean": 0.8222500681877136, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 177.89732360839844, "completions/mean_terminated_length": 150.6036834716797, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2891895352311041, "grad_norm": 0.020076138898730278, "kl": 0.015607357025146484, "learning_rate": 7.872431742591268e-07, "loss": -0.033, "num_tokens": 301673162.0, "reward": 0.3954106271266937, "reward_std": 0.06636636704206467, "rewards/gemini_judge_reward_func/mean": 0.1060267835855484, "rewards/gemini_judge_reward_func/std": 0.24871297180652618, "rewards/semantic_correctness_reward_func/mean": 0.4311065077781677, "rewards/semantic_correctness_reward_func/std": 0.18301746249198914, "rewards/xmlcount_reward_func/mean": 0.6669464707374573, "rewards/xmlcount_reward_func/std": 0.4685778319835663, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 161.6607208251953, "completions/mean_terminated_length": 149.9547576904297, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.28953096325380906, "grad_norm": 0.02056868001818657, "kl": 0.01586151123046875, "learning_rate": 7.783603724899258e-07, "loss": 0.0144, "num_tokens": 302045542.0, "reward": 0.44777730107307434, "reward_std": 0.06656524538993835, "rewards/gemini_judge_reward_func/mean": 0.1618303507566452, "rewards/gemini_judge_reward_func/std": 0.2631829082965851, "rewards/semantic_correctness_reward_func/mean": 0.48522552847862244, "rewards/semantic_correctness_reward_func/std": 0.22041460871696472, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 160.20982360839844, "completions/mean_terminated_length": 148.4841766357422, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.28987239127651404, "grad_norm": 0.020083541050553322, "kl": 0.01417398452758789, "learning_rate": 7.695237378953224e-07, "loss": 0.0159, "num_tokens": 302391601.0, "reward": 0.4260123372077942, "reward_std": 0.06106026470661163, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.20637255907058716, "rewards/semantic_correctness_reward_func/mean": 0.4433649182319641, "rewards/semantic_correctness_reward_func/std": 0.1907728761434555, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 172.33929443359375, "completions/mean_terminated_length": 148.89907836914062, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.29021381929921897, "grad_norm": 0.019677992910146713, "kl": 0.014250516891479492, "learning_rate": 7.607333671113409e-07, "loss": 0.0118, "num_tokens": 302743377.0, "reward": 0.4396704435348511, "reward_std": 0.06315828114748001, "rewards/gemini_judge_reward_func/mean": 0.1696428507566452, "rewards/gemini_judge_reward_func/std": 0.292304664850235, "rewards/semantic_correctness_reward_func/mean": 0.46481624245643616, "rewards/semantic_correctness_reward_func/std": 0.1968490481376648, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 181.64732360839844, "completions/mean_terminated_length": 158.4633026123047, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.29055524732192395, "grad_norm": 0.0194232277572155, "kl": 0.012831687927246094, "learning_rate": 7.519893562680663e-07, "loss": 0.0072, "num_tokens": 303115110.0, "reward": 0.46728041768074036, "reward_std": 0.07786200195550919, "rewards/gemini_judge_reward_func/mean": 0.1607142835855484, "rewards/gemini_judge_reward_func/std": 0.2717183828353882, "rewards/semantic_correctness_reward_func/mean": 0.43975013494491577, "rewards/semantic_correctness_reward_func/std": 0.21617595851421356, "rewards/xmlcount_reward_func/mean": 0.7876116633415222, "rewards/xmlcount_reward_func/std": 0.4077901840209961, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 158.8482208251953, "completions/mean_terminated_length": 143.11817932128906, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.29089667534462893, "grad_norm": 0.01978623867034912, "kl": 0.012966394424438477, "learning_rate": 7.432918009885997e-07, "loss": 0.0013, "num_tokens": 303451772.0, "reward": 0.4542922377586365, "reward_std": 0.0721825510263443, "rewards/gemini_judge_reward_func/mean": 0.1540178507566452, "rewards/gemini_judge_reward_func/std": 0.2816223204135895, "rewards/semantic_correctness_reward_func/mean": 0.4619251787662506, "rewards/semantic_correctness_reward_func/std": 0.22607234120368958, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 166.27679443359375, "completions/mean_terminated_length": 154.63348388671875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.29123810336733386, "grad_norm": 0.021079065278172493, "kl": 0.013202190399169922, "learning_rate": 7.346407963880137e-07, "loss": -0.0072, "num_tokens": 303776266.0, "reward": 0.45635682344436646, "reward_std": 0.06529436260461807, "rewards/gemini_judge_reward_func/mean": 0.1194196417927742, "rewards/gemini_judge_reward_func/std": 0.2169191539287567, "rewards/semantic_correctness_reward_func/mean": 0.43419477343559265, "rewards/semantic_correctness_reward_func/std": 0.19115614891052246, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 168.6919708251953, "completions/mean_terminated_length": 149.1643829345703, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.29157953139003884, "grad_norm": 0.019034838303923607, "kl": 0.01335906982421875, "learning_rate": 7.260364370723044e-07, "loss": 0.0358, "num_tokens": 304163093.0, "reward": 0.389647513628006, "reward_std": 0.06260879337787628, "rewards/gemini_judge_reward_func/mean": 0.1171875, "rewards/gemini_judge_reward_func/std": 0.24705736339092255, "rewards/semantic_correctness_reward_func/mean": 0.4223891794681549, "rewards/semantic_correctness_reward_func/std": 0.2090659886598587, "rewards/xmlcount_reward_func/mean": 0.6457366943359375, "rewards/xmlcount_reward_func/std": 0.4788653254508972, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 160.6294708251953, "completions/mean_terminated_length": 156.7578582763672, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2919209594127438, "grad_norm": 0.020094826817512512, "kl": 0.012228012084960938, "learning_rate": 7.174788171373731e-07, "loss": -0.0181, "num_tokens": 304475666.0, "reward": 0.44959086179733276, "reward_std": 0.06874435395002365, "rewards/gemini_judge_reward_func/mean": 0.1138392835855484, "rewards/gemini_judge_reward_func/std": 0.2341102659702301, "rewards/semantic_correctness_reward_func/mean": 0.41152557730674744, "rewards/semantic_correctness_reward_func/std": 0.21029044687747955, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 156.6294708251953, "completions/mean_terminated_length": 140.8590850830078, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.29226238743544874, "grad_norm": 0.020419996231794357, "kl": 0.01521611213684082, "learning_rate": 7.089680301679752e-07, "loss": -0.0014, "num_tokens": 304836359.0, "reward": 0.4453238248825073, "reward_std": 0.061911653727293015, "rewards/gemini_judge_reward_func/mean": 0.1607142835855484, "rewards/gemini_judge_reward_func/std": 0.2622709274291992, "rewards/semantic_correctness_reward_func/mean": 0.43944045901298523, "rewards/semantic_correctness_reward_func/std": 0.21513807773590088, "rewards/xmlcount_reward_func/mean": 0.7328750491142273, "rewards/xmlcount_reward_func/std": 0.44427841901779175, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 159.61607360839844, "completions/mean_terminated_length": 143.89999389648438, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2926038154581537, "grad_norm": 0.028872739523649216, "kl": 0.01840972900390625, "learning_rate": 7.005041692367154e-07, "loss": 0.0281, "num_tokens": 305184333.0, "reward": 0.4945422112941742, "reward_std": 0.08059635013341904, "rewards/gemini_judge_reward_func/mean": 0.1640625, "rewards/gemini_judge_reward_func/std": 0.27535709738731384, "rewards/semantic_correctness_reward_func/mean": 0.46433597803115845, "rewards/semantic_correctness_reward_func/std": 0.21889406442642212, "rewards/xmlcount_reward_func/mean": 0.8401250839233398, "rewards/xmlcount_reward_func/std": 0.3684578537940979, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 187.21876525878906, "completions/mean_terminated_length": 156.2268524169922, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2929452434808587, "grad_norm": 0.020401984453201294, "kl": 0.012844085693359375, "learning_rate": 6.92087326903022e-07, "loss": 0.007, "num_tokens": 305534010.0, "reward": 0.4454714357852936, "reward_std": 0.06435173749923706, "rewards/gemini_judge_reward_func/mean": 0.140625, "rewards/gemini_judge_reward_func/std": 0.25888094305992126, "rewards/semantic_correctness_reward_func/mean": 0.48035693168640137, "rewards/semantic_correctness_reward_func/std": 0.20532502233982086, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 167.4375, "completions/mean_terminated_length": 155.80996704101562, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.29328667150356363, "grad_norm": 0.019876059144735336, "kl": 0.014787673950195312, "learning_rate": 6.837175952121305e-07, "loss": -0.0128, "num_tokens": 305878924.0, "reward": 0.42637744545936584, "reward_std": 0.06756948679685593, "rewards/gemini_judge_reward_func/mean": 0.1183035746216774, "rewards/gemini_judge_reward_func/std": 0.2269900143146515, "rewards/semantic_correctness_reward_func/mean": 0.4116818606853485, "rewards/semantic_correctness_reward_func/std": 0.18583944439888, "rewards/xmlcount_reward_func/mean": 0.7417991757392883, "rewards/xmlcount_reward_func/std": 0.4394584894180298, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 164.16519165039062, "completions/mean_terminated_length": 148.5318145751953, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.2936280995262686, "grad_norm": 0.019963495433330536, "kl": 0.014317989349365234, "learning_rate": 6.753950656940905e-07, "loss": -0.0166, "num_tokens": 306243169.0, "reward": 0.4247058033943176, "reward_std": 0.07365789264440536, "rewards/gemini_judge_reward_func/mean": 0.1261160671710968, "rewards/gemini_judge_reward_func/std": 0.2740635871887207, "rewards/semantic_correctness_reward_func/mean": 0.41675207018852234, "rewards/semantic_correctness_reward_func/std": 0.21209114789962769, "rewards/xmlcount_reward_func/mean": 0.7272723913192749, "rewards/xmlcount_reward_func/std": 0.44621461629867554, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 152.4553680419922, "completions/mean_terminated_length": 148.54708862304688, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.2939695275489736, "grad_norm": 0.018703632056713104, "kl": 0.01490640640258789, "learning_rate": 6.671198293627479e-07, "loss": -0.0362, "num_tokens": 306606443.0, "reward": 0.4184475243091583, "reward_std": 0.07583961635828018, "rewards/gemini_judge_reward_func/mean": 0.171875, "rewards/gemini_judge_reward_func/std": 0.2710452377796173, "rewards/semantic_correctness_reward_func/mean": 0.46148738265037537, "rewards/semantic_correctness_reward_func/std": 0.20480482280254364, "rewards/xmlcount_reward_func/mean": 0.643500030040741, "rewards/xmlcount_reward_func/std": 0.48071083426475525, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 169.95982360839844, "completions/mean_terminated_length": 154.4318084716797, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.2943109555716785, "grad_norm": 0.02081831730902195, "kl": 0.013816595077514648, "learning_rate": 6.58891976714764e-07, "loss": -0.0332, "num_tokens": 306977126.0, "reward": 0.428364098072052, "reward_std": 0.06174404174089432, "rewards/gemini_judge_reward_func/mean": 0.1049107164144516, "rewards/gemini_judge_reward_func/std": 0.21344491839408875, "rewards/semantic_correctness_reward_func/mean": 0.43049880862236023, "rewards/semantic_correctness_reward_func/std": 0.19661079347133636, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 176.09376525878906, "completions/mean_terminated_length": 156.7351531982422, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.2946523835943835, "grad_norm": 0.01923784427344799, "kl": 0.014006614685058594, "learning_rate": 6.507115977286144e-07, "loss": 0.0059, "num_tokens": 307345083.0, "reward": 0.4206071197986603, "reward_std": 0.060593608766794205, "rewards/gemini_judge_reward_func/mean": 0.1149553582072258, "rewards/gemini_judge_reward_func/std": 0.23416103422641754, "rewards/semantic_correctness_reward_func/mean": 0.40737470984458923, "rewards/semantic_correctness_reward_func/std": 0.19159357249736786, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 150.45982360839844, "completions/mean_terminated_length": 138.60182189941406, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2949938116170885, "grad_norm": 0.02324904315173626, "kl": 0.01671123504638672, "learning_rate": 6.425787818636131e-07, "loss": 0.0013, "num_tokens": 307693414.0, "reward": 0.4323778748512268, "reward_std": 0.0673980787396431, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.2471712976694107, "rewards/semantic_correctness_reward_func/mean": 0.4416749179363251, "rewards/semantic_correctness_reward_func/std": 0.20484818518161774, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 157.45089721679688, "completions/mean_terminated_length": 145.6877899169922, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.2953352396397934, "grad_norm": 0.02026812545955181, "kl": 0.017922401428222656, "learning_rate": 6.34493618058935e-07, "loss": -0.0241, "num_tokens": 308038595.0, "reward": 0.40504324436187744, "reward_std": 0.0582125298678875, "rewards/gemini_judge_reward_func/mean": 0.0870535746216774, "rewards/gemini_judge_reward_func/std": 0.1900254338979721, "rewards/semantic_correctness_reward_func/mean": 0.43004655838012695, "rewards/semantic_correctness_reward_func/std": 0.18637752532958984, "rewards/xmlcount_reward_func/mean": 0.7105312943458557, "rewards/xmlcount_reward_func/std": 0.4553159773349762, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 151.52232360839844, "completions/mean_terminated_length": 135.65908813476562, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.2956766676624984, "grad_norm": 0.02160787023603916, "kl": 0.016959190368652344, "learning_rate": 6.264561947326331e-07, "loss": -0.0499, "num_tokens": 308428480.0, "reward": 0.38458970189094543, "reward_std": 0.061651427298784256, "rewards/gemini_judge_reward_func/mean": 0.1495535671710968, "rewards/gemini_judge_reward_func/std": 0.2810530662536621, "rewards/semantic_correctness_reward_func/mean": 0.44409117102622986, "rewards/semantic_correctness_reward_func/std": 0.2333114594221115, "rewards/xmlcount_reward_func/mean": 0.5898750424385071, "rewards/xmlcount_reward_func/std": 0.493558406829834, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 149.66964721679688, "completions/mean_terminated_length": 137.8009033203125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2960180956852034, "grad_norm": 0.022309862077236176, "kl": 0.020073890686035156, "learning_rate": 6.184665997806832e-07, "loss": 0.0107, "num_tokens": 308800722.0, "reward": 0.41277799010276794, "reward_std": 0.0709206610918045, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.2195136845111847, "rewards/semantic_correctness_reward_func/mean": 0.4352737367153168, "rewards/semantic_correctness_reward_func/std": 0.21140582859516144, "rewards/xmlcount_reward_func/mean": 0.6937723755836487, "rewards/xmlcount_reward_func/std": 0.46180078387260437, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 173.76339721679688, "completions/mean_terminated_length": 158.30453491210938, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2963595237079083, "grad_norm": 0.021554123610258102, "kl": 0.012543916702270508, "learning_rate": 6.105249205760128e-07, "loss": 0.0002, "num_tokens": 309163189.0, "reward": 0.4783555567264557, "reward_std": 0.0800277516245842, "rewards/gemini_judge_reward_func/mean": 0.1618303507566452, "rewards/gemini_judge_reward_func/std": 0.2705346941947937, "rewards/semantic_correctness_reward_func/mean": 0.45045629143714905, "rewards/semantic_correctness_reward_func/std": 0.23706389963626862, "rewards/xmlcount_reward_func/mean": 0.8088303804397583, "rewards/xmlcount_reward_func/std": 0.39227935671806335, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 166.58929443359375, "completions/mean_terminated_length": 154.95022583007812, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.2967009517306133, "grad_norm": 0.019824448972940445, "kl": 0.017318248748779297, "learning_rate": 6.026312439675553e-07, "loss": -0.015, "num_tokens": 309522953.0, "reward": 0.47017693519592285, "reward_std": 0.08199930191040039, "rewards/gemini_judge_reward_func/mean": 0.1473214328289032, "rewards/gemini_judge_reward_func/std": 0.26269039511680603, "rewards/semantic_correctness_reward_func/mean": 0.44304510951042175, "rewards/semantic_correctness_reward_func/std": 0.22514069080352783, "rewards/xmlcount_reward_func/mean": 0.8065982460975647, "rewards/xmlcount_reward_func/std": 0.3925861418247223, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 162.86607360839844, "completions/mean_terminated_length": 151.17648315429688, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.29704237975331826, "grad_norm": 0.019513197243213654, "kl": 0.016964197158813477, "learning_rate": 5.947856562792926e-07, "loss": -0.0283, "num_tokens": 309917055.0, "reward": 0.4131915271282196, "reward_std": 0.058818139135837555, "rewards/gemini_judge_reward_func/mean": 0.1160714253783226, "rewards/gemini_judge_reward_func/std": 0.23658768832683563, "rewards/semantic_correctness_reward_func/mean": 0.4277253746986389, "rewards/semantic_correctness_reward_func/std": 0.21168087422847748, "rewards/xmlcount_reward_func/mean": 0.7030447721481323, "rewards/xmlcount_reward_func/std": 0.45517581701278687, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 163.6294708251953, "completions/mean_terminated_length": 151.95022583007812, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2973838077760232, "grad_norm": 0.02505846694111824, "kl": 0.01614856719970703, "learning_rate": 5.869882433093154e-07, "loss": -0.0025, "num_tokens": 310251404.0, "reward": 0.469322144985199, "reward_std": 0.06780051440000534, "rewards/gemini_judge_reward_func/mean": 0.1674107164144516, "rewards/gemini_judge_reward_func/std": 0.28190651535987854, "rewards/semantic_correctness_reward_func/mean": 0.4745390713214874, "rewards/semantic_correctness_reward_func/std": 0.21615885198116302, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 163.66964721679688, "completions/mean_terminated_length": 144.0273895263672, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.29772523579872817, "grad_norm": 0.01972603239119053, "kl": 0.014445066452026367, "learning_rate": 5.79239090328883e-07, "loss": 0.0015, "num_tokens": 310608642.0, "reward": 0.44004756212234497, "reward_std": 0.062194082885980606, "rewards/gemini_judge_reward_func/mean": 0.1383928507566452, "rewards/gemini_judge_reward_func/std": 0.2837909758090973, "rewards/semantic_correctness_reward_func/mean": 0.45770174264907837, "rewards/semantic_correctness_reward_func/std": 0.23461376130580902, "rewards/xmlcount_reward_func/mean": 0.7328750491142273, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 151.84375, "completions/mean_terminated_length": 151.84375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.29806666382143315, "grad_norm": 0.020589305087924004, "kl": 0.012288570404052734, "learning_rate": 5.715382820814885e-07, "loss": -0.0382, "num_tokens": 310927507.0, "reward": 0.46137532591819763, "reward_std": 0.06986761838197708, "rewards/gemini_judge_reward_func/mean": 0.1082589253783226, "rewards/gemini_judge_reward_func/std": 0.21504218876361847, "rewards/semantic_correctness_reward_func/mean": 0.4101085662841797, "rewards/semantic_correctness_reward_func/std": 0.22632215917110443, "rewards/xmlcount_reward_func/mean": 0.8401250839233398, "rewards/xmlcount_reward_func/std": 0.3684578835964203, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 171.75001525878906, "completions/mean_terminated_length": 144.25807189941406, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.29840809184413813, "grad_norm": 0.018754450604319572, "kl": 0.014352798461914062, "learning_rate": 5.63885902781941e-07, "loss": -0.0132, "num_tokens": 311314675.0, "reward": 0.42426401376724243, "reward_std": 0.06676241010427475, "rewards/gemini_judge_reward_func/mean": 0.1160714253783226, "rewards/gemini_judge_reward_func/std": 0.24243849515914917, "rewards/semantic_correctness_reward_func/mean": 0.43821272253990173, "rewards/semantic_correctness_reward_func/std": 0.20891118049621582, "rewards/xmlcount_reward_func/mean": 0.725482165813446, "rewards/xmlcount_reward_func/std": 0.4461726248264313, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 151.74554443359375, "completions/mean_terminated_length": 151.74554443359375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.29874951986684306, "grad_norm": 0.021069129928946495, "kl": 0.01585555076599121, "learning_rate": 5.562820361154315e-07, "loss": 0.0016, "num_tokens": 311651818.0, "reward": 0.44300538301467896, "reward_std": 0.06342455744743347, "rewards/gemini_judge_reward_func/mean": 0.1216517835855484, "rewards/gemini_judge_reward_func/std": 0.23554620146751404, "rewards/semantic_correctness_reward_func/mean": 0.4344729781150818, "rewards/semantic_correctness_reward_func/std": 0.20860819518566132, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 179.93304443359375, "completions/mean_terminated_length": 148.67129516601562, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.29909094788954804, "grad_norm": 0.020150672644376755, "kl": 0.014079093933105469, "learning_rate": 5.487267652366291e-07, "loss": 0.0032, "num_tokens": 312022119.0, "reward": 0.3856308162212372, "reward_std": 0.05329553782939911, "rewards/gemini_judge_reward_func/mean": 0.0948660746216774, "rewards/gemini_judge_reward_func/std": 0.20417827367782593, "rewards/semantic_correctness_reward_func/mean": 0.4156718850135803, "rewards/semantic_correctness_reward_func/std": 0.19628028571605682, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 169.14732360839844, "completions/mean_terminated_length": 145.6192626953125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.299432375912253, "grad_norm": 0.019851071760058403, "kl": 0.012566089630126953, "learning_rate": 5.412201727687644e-07, "loss": -0.0112, "num_tokens": 312423944.0, "reward": 0.39708369970321655, "reward_std": 0.07032705843448639, "rewards/gemini_judge_reward_func/mean": 0.140625, "rewards/gemini_judge_reward_func/std": 0.25888094305992126, "rewards/semantic_correctness_reward_func/mean": 0.4261060357093811, "rewards/semantic_correctness_reward_func/std": 0.19139282405376434, "rewards/xmlcount_reward_func/mean": 0.6390312910079956, "rewards/xmlcount_reward_func/std": 0.4820234477519989, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 147.0044708251953, "completions/mean_terminated_length": 147.0044708251953, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.29977380393495795, "grad_norm": 0.01827196218073368, "kl": 0.014067649841308594, "learning_rate": 5.337623408027293e-07, "loss": -0.0067, "num_tokens": 312785133.0, "reward": 0.4566745162010193, "reward_std": 0.07833483070135117, "rewards/gemini_judge_reward_func/mean": 0.1841517835855484, "rewards/gemini_judge_reward_func/std": 0.31206128001213074, "rewards/semantic_correctness_reward_func/mean": 0.4940064251422882, "rewards/semantic_correctness_reward_func/std": 0.22719837725162506, "rewards/xmlcount_reward_func/mean": 0.7105312943458557, "rewards/xmlcount_reward_func/std": 0.4553159773349762, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 170.375, "completions/mean_terminated_length": 154.8545379638672, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.30011523195766293, "grad_norm": 0.019717033952474594, "kl": 0.013404130935668945, "learning_rate": 5.263533508961827e-07, "loss": -0.0193, "num_tokens": 313157273.0, "reward": 0.4236012399196625, "reward_std": 0.07436075061559677, "rewards/gemini_judge_reward_func/mean": 0.1104910746216774, "rewards/gemini_judge_reward_func/std": 0.23512084782123566, "rewards/semantic_correctness_reward_func/mean": 0.4133989214897156, "rewards/semantic_correctness_reward_func/std": 0.21528121829032898, "rewards/xmlcount_reward_func/mean": 0.7418125867843628, "rewards/xmlcount_reward_func/std": 0.4394664168357849, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 157.05357360839844, "completions/mean_terminated_length": 137.26026916503906, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.3004566599803679, "grad_norm": 0.020437972620129585, "kl": 0.017246723175048828, "learning_rate": 5.189932840726486e-07, "loss": -0.02, "num_tokens": 313499205.0, "reward": 0.43167340755462646, "reward_std": 0.06233254447579384, "rewards/gemini_judge_reward_func/mean": 0.1160714253783226, "rewards/gemini_judge_reward_func/std": 0.23540008068084717, "rewards/semantic_correctness_reward_func/mean": 0.433661550283432, "rewards/semantic_correctness_reward_func/std": 0.20578143000602722, "rewards/xmlcount_reward_func/mean": 0.7462812662124634, "rewards/xmlcount_reward_func/std": 0.4369716942310333, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 153.95982360839844, "completions/mean_terminated_length": 150.0583038330078, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.30079808800307284, "grad_norm": 0.020293962210416794, "kl": 0.010478973388671875, "learning_rate": 5.116822208206396e-07, "loss": -0.0048, "num_tokens": 313856116.0, "reward": 0.48650670051574707, "reward_std": 0.07343684136867523, "rewards/gemini_judge_reward_func/mean": 0.1540178507566452, "rewards/gemini_judge_reward_func/std": 0.26096048951148987, "rewards/semantic_correctness_reward_func/mean": 0.41867607831954956, "rewards/semantic_correctness_reward_func/std": 0.22737175226211548, "rewards/xmlcount_reward_func/mean": 0.8529108166694641, "rewards/xmlcount_reward_func/std": 0.3554360866546631, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 152.30357360839844, "completions/mean_terminated_length": 144.45045471191406, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.3011395160257778, "grad_norm": 0.021946420893073082, "kl": 0.011649847030639648, "learning_rate": 5.044202410927707e-07, "loss": -0.023, "num_tokens": 314198824.0, "reward": 0.5047093629837036, "reward_std": 0.06970416009426117, "rewards/gemini_judge_reward_func/mean": 0.2142857164144516, "rewards/gemini_judge_reward_func/std": 0.31650617718696594, "rewards/semantic_correctness_reward_func/mean": 0.4862251281738281, "rewards/semantic_correctness_reward_func/std": 0.24020838737487793, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 163.53125, "completions/mean_terminated_length": 151.85069274902344, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.3014809440484828, "grad_norm": 0.01995939202606678, "kl": 0.013335943222045898, "learning_rate": 4.972074243048896e-07, "loss": -0.0016, "num_tokens": 314553251.0, "reward": 0.4273732602596283, "reward_std": 0.05411674082279205, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.2527475357055664, "rewards/semantic_correctness_reward_func/mean": 0.39429473876953125, "rewards/semantic_correctness_reward_func/std": 0.22735366225242615, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 185.68304443359375, "completions/mean_terminated_length": 154.63426208496094, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.30182237207118773, "grad_norm": 0.018948886543512344, "kl": 0.013762235641479492, "learning_rate": 4.900438493352056e-07, "loss": 0.0087, "num_tokens": 314895628.0, "reward": 0.47533664107322693, "reward_std": 0.06594642996788025, "rewards/gemini_judge_reward_func/mean": 0.15625, "rewards/gemini_judge_reward_func/std": 0.2843547463417053, "rewards/semantic_correctness_reward_func/mean": 0.45543310046195984, "rewards/semantic_correctness_reward_func/std": 0.23065660893917084, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 162.8794708251953, "completions/mean_terminated_length": 147.22271728515625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.3021638000938927, "grad_norm": 0.022125041112303734, "kl": 0.012331485748291016, "learning_rate": 4.829295945234258e-07, "loss": -0.0208, "num_tokens": 315235093.0, "reward": 0.45227399468421936, "reward_std": 0.07823660224676132, "rewards/gemini_judge_reward_func/mean": 0.1462053507566452, "rewards/gemini_judge_reward_func/std": 0.24740150570869446, "rewards/semantic_correctness_reward_func/mean": 0.45180743932724, "rewards/semantic_correctness_reward_func/std": 0.20693226158618927, "rewards/xmlcount_reward_func/mean": 0.7585759162902832, "rewards/xmlcount_reward_func/std": 0.42353785037994385, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 180.0491180419922, "completions/mean_terminated_length": 152.82489013671875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.3025052281165977, "grad_norm": 0.01876814290881157, "kl": 0.015291213989257812, "learning_rate": 4.758647376699033e-07, "loss": -0.0157, "num_tokens": 315648760.0, "reward": 0.37088969349861145, "reward_std": 0.06254278868436813, "rewards/gemini_judge_reward_func/mean": 0.1227678582072258, "rewards/gemini_judge_reward_func/std": 0.2414351999759674, "rewards/semantic_correctness_reward_func/mean": 0.4635911285877228, "rewards/semantic_correctness_reward_func/std": 0.21879442036151886, "rewards/xmlcount_reward_func/mean": 0.5726607441902161, "rewards/xmlcount_reward_func/std": 0.49444088339805603, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 158.46429443359375, "completions/mean_terminated_length": 150.6666717529297, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.3028466561393026, "grad_norm": 0.019580967724323273, "kl": 0.01501321792602539, "learning_rate": 4.6884935603477733e-07, "loss": -0.0064, "num_tokens": 316007392.0, "reward": 0.4427121579647064, "reward_std": 0.07204774022102356, "rewards/gemini_judge_reward_func/mean": 0.1607142835855484, "rewards/gemini_judge_reward_func/std": 0.28283706307411194, "rewards/semantic_correctness_reward_func/mean": 0.4430249035358429, "rewards/semantic_correctness_reward_func/std": 0.24078968167304993, "rewards/xmlcount_reward_func/mean": 0.7245535850524902, "rewards/xmlcount_reward_func/std": 0.44617968797683716, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 174.3794708251953, "completions/mean_terminated_length": 146.97235107421875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.3031880841620076, "grad_norm": 0.020197952166199684, "kl": 0.01589512825012207, "learning_rate": 4.6188352633713964e-07, "loss": -0.005, "num_tokens": 316382981.0, "reward": 0.4027620255947113, "reward_std": 0.06222005560994148, "rewards/gemini_judge_reward_func/mean": 0.109375, "rewards/gemini_judge_reward_func/std": 0.23385359346866608, "rewards/semantic_correctness_reward_func/mean": 0.40059566497802734, "rewards/semantic_correctness_reward_func/std": 0.21668115258216858, "rewards/xmlcount_reward_func/mean": 0.697232186794281, "rewards/xmlcount_reward_func/std": 0.5156688094139099, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 170.54464721679688, "completions/mean_terminated_length": 147.05503845214844, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.3035295121847126, "grad_norm": 0.020838545635342598, "kl": 0.01421976089477539, "learning_rate": 4.549673247541875e-07, "loss": 0.0222, "num_tokens": 316754535.0, "reward": 0.4516361355781555, "reward_std": 0.060274988412857056, "rewards/gemini_judge_reward_func/mean": 0.1551339328289032, "rewards/gemini_judge_reward_func/std": 0.2723967432975769, "rewards/semantic_correctness_reward_func/mean": 0.4464126229286194, "rewards/semantic_correctness_reward_func/std": 0.22379936277866364, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 155.49554443359375, "completions/mean_terminated_length": 151.6009063720703, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.3038709402074175, "grad_norm": 0.022012127563357353, "kl": 0.013429880142211914, "learning_rate": 4.48100826920394e-07, "loss": -0.0292, "num_tokens": 317093814.0, "reward": 0.452115535736084, "reward_std": 0.07477650046348572, "rewards/gemini_judge_reward_func/mean": 0.1417410671710968, "rewards/gemini_judge_reward_func/std": 0.2396954447031021, "rewards/semantic_correctness_reward_func/mean": 0.43247920274734497, "rewards/semantic_correctness_reward_func/std": 0.19742132723331451, "rewards/xmlcount_reward_func/mean": 0.7723080515861511, "rewards/xmlcount_reward_func/std": 0.41869527101516724, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 171.50894165039062, "completions/mean_terminated_length": 152.045654296875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.3042123682301225, "grad_norm": 0.023126568645238876, "kl": 0.01566934585571289, "learning_rate": 4.412841079266778e-07, "loss": -0.0598, "num_tokens": 317443944.0, "reward": 0.4357885718345642, "reward_std": 0.07174669206142426, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.23674629628658295, "rewards/semantic_correctness_reward_func/mean": 0.42297855019569397, "rewards/semantic_correctness_reward_func/std": 0.2082798182964325, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 153.9419708251953, "completions/mean_terminated_length": 142.1312255859375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.30455379625282747, "grad_norm": 0.018571894615888596, "kl": 0.010985612869262695, "learning_rate": 4.345172423195865e-07, "loss": 0.0101, "num_tokens": 317798447.0, "reward": 0.41238030791282654, "reward_std": 0.061363838613033295, "rewards/gemini_judge_reward_func/mean": 0.1328125, "rewards/gemini_judge_reward_func/std": 0.23898446559906006, "rewards/semantic_correctness_reward_func/mean": 0.4377765357494354, "rewards/semantic_correctness_reward_func/std": 0.21453407406806946, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 154.39732360839844, "completions/mean_terminated_length": 142.5927734375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.3048952242755324, "grad_norm": 0.020391054451465607, "kl": 0.017475605010986328, "learning_rate": 4.27800304100478e-07, "loss": 0.0157, "num_tokens": 318175080.0, "reward": 0.43031346797943115, "reward_std": 0.06380556523799896, "rewards/gemini_judge_reward_func/mean": 0.140625, "rewards/gemini_judge_reward_func/std": 0.2511883080005646, "rewards/semantic_correctness_reward_func/mean": 0.4581742286682129, "rewards/semantic_correctness_reward_func/std": 0.23177775740623474, "rewards/xmlcount_reward_func/mean": 0.7060714364051819, "rewards/xmlcount_reward_func/std": 0.4524170458316803, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 168.71875, "completions/mean_terminated_length": 153.16818237304688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3052366522982374, "grad_norm": 0.02003057673573494, "kl": 0.013791561126708984, "learning_rate": 4.211333667247125e-07, "loss": -0.0267, "num_tokens": 318535501.0, "reward": 0.44697102904319763, "reward_std": 0.06763234734535217, "rewards/gemini_judge_reward_func/mean": 0.1506696492433548, "rewards/gemini_judge_reward_func/std": 0.269756555557251, "rewards/semantic_correctness_reward_func/mean": 0.4677656590938568, "rewards/semantic_correctness_reward_func/std": 0.21983444690704346, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 156.6919708251953, "completions/mean_terminated_length": 144.91856384277344, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.30557808032094236, "grad_norm": 0.020025836303830147, "kl": 0.01305532455444336, "learning_rate": 4.1451650310085076e-07, "loss": -0.0072, "num_tokens": 318903816.0, "reward": 0.44203463196754456, "reward_std": 0.06274479627609253, "rewards/gemini_judge_reward_func/mean": 0.1216517835855484, "rewards/gemini_judge_reward_func/std": 0.2460220754146576, "rewards/semantic_correctness_reward_func/mean": 0.46536940336227417, "rewards/semantic_correctness_reward_func/std": 0.21820604801177979, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 164.19644165039062, "completions/mean_terminated_length": 152.52488708496094, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3059195083436473, "grad_norm": 0.020183706656098366, "kl": 0.014451742172241211, "learning_rate": 4.079497855898501e-07, "loss": -0.0157, "num_tokens": 319248500.0, "reward": 0.4057226777076721, "reward_std": 0.05680568888783455, "rewards/gemini_judge_reward_func/mean": 0.0904017835855484, "rewards/gemini_judge_reward_func/std": 0.20755748450756073, "rewards/semantic_correctness_reward_func/mean": 0.4178095757961273, "rewards/semantic_correctness_reward_func/std": 0.18284156918525696, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 165.16519165039062, "completions/mean_terminated_length": 153.5067901611328, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.30626093636635227, "grad_norm": 0.0204419307410717, "kl": 0.013719558715820312, "learning_rate": 4.01433286004283e-07, "loss": -0.0104, "num_tokens": 319595289.0, "reward": 0.47336649894714355, "reward_std": 0.07261441648006439, "rewards/gemini_judge_reward_func/mean": 0.1261160671710968, "rewards/gemini_judge_reward_func/std": 0.23793506622314453, "rewards/semantic_correctness_reward_func/mean": 0.4623859226703644, "rewards/semantic_correctness_reward_func/std": 0.20957084000110626, "rewards/xmlcount_reward_func/mean": 0.8261072039604187, "rewards/xmlcount_reward_func/std": 0.37810245156288147, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 157.9107208251953, "completions/mean_terminated_length": 150.1081085205078, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.30660236438905725, "grad_norm": 0.021409234032034874, "kl": 0.012285232543945312, "learning_rate": 3.949670756075447e-07, "loss": -0.0011, "num_tokens": 319936469.0, "reward": 0.49861517548561096, "reward_std": 0.05451667681336403, "rewards/gemini_judge_reward_func/mean": 0.1473214328289032, "rewards/gemini_judge_reward_func/std": 0.21190352737903595, "rewards/semantic_correctness_reward_func/mean": 0.4489149749279022, "rewards/semantic_correctness_reward_func/std": 0.19751466810703278, "rewards/xmlcount_reward_func/mean": 0.8747590184211731, "rewards/xmlcount_reward_func/std": 0.3317887485027313, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 176.21429443359375, "completions/mean_terminated_length": 156.8584442138672, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.3069437924117622, "grad_norm": 0.020069124177098274, "kl": 0.012425422668457031, "learning_rate": 3.885512251130763e-07, "loss": -0.0324, "num_tokens": 320298373.0, "reward": 0.43729156255722046, "reward_std": 0.05971502512693405, "rewards/gemini_judge_reward_func/mean": 0.1138392835855484, "rewards/gemini_judge_reward_func/std": 0.19918103516101837, "rewards/semantic_correctness_reward_func/mean": 0.43493086099624634, "rewards/semantic_correctness_reward_func/std": 0.1836792379617691, "rewards/xmlcount_reward_func/mean": 0.7619242072105408, "rewards/xmlcount_reward_func/std": 0.4237978458404541, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 160.74554443359375, "completions/mean_terminated_length": 145.04998779296875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.30728522043446715, "grad_norm": 0.02074616402387619, "kl": 0.015354156494140625, "learning_rate": 3.8218580468359136e-07, "loss": 0.0332, "num_tokens": 320666908.0, "reward": 0.40854790806770325, "reward_std": 0.061800092458724976, "rewards/gemini_judge_reward_func/mean": 0.1049107164144516, "rewards/gemini_judge_reward_func/std": 0.21475397050380707, "rewards/semantic_correctness_reward_func/mean": 0.43866798281669617, "rewards/semantic_correctness_reward_func/std": 0.2061757743358612, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 152.12054443359375, "completions/mean_terminated_length": 144.26576232910156, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.30762664845717214, "grad_norm": 0.021649349480867386, "kl": 0.014882326126098633, "learning_rate": 3.7587088393030604e-07, "loss": 0.0046, "num_tokens": 321007971.0, "reward": 0.4334132671356201, "reward_std": 0.06561165302991867, "rewards/gemini_judge_reward_func/mean": 0.09375, "rewards/gemini_judge_reward_func/std": 0.20809029042720795, "rewards/semantic_correctness_reward_func/mean": 0.4065660834312439, "rewards/semantic_correctness_reward_func/std": 0.18776313960552216, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 170.97769165039062, "completions/mean_terminated_length": 143.46083068847656, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.30796807647987706, "grad_norm": 0.021009381860494614, "kl": 0.014513492584228516, "learning_rate": 3.6960653191218333e-07, "loss": -0.0344, "num_tokens": 321386086.0, "reward": 0.377293199300766, "reward_std": 0.05772269144654274, "rewards/gemini_judge_reward_func/mean": 0.0904017835855484, "rewards/gemini_judge_reward_func/std": 0.1935839205980301, "rewards/semantic_correctness_reward_func/mean": 0.38291215896606445, "rewards/semantic_correctness_reward_func/std": 0.17295578122138977, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 177.37054443359375, "completions/mean_terminated_length": 150.05990600585938, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.30830950450258204, "grad_norm": 0.019631680101156235, "kl": 0.011731147766113281, "learning_rate": 3.6339281713517304e-07, "loss": -0.0512, "num_tokens": 321745689.0, "reward": 0.4260324239730835, "reward_std": 0.07654067873954773, "rewards/gemini_judge_reward_func/mean": 0.1238839253783226, "rewards/gemini_judge_reward_func/std": 0.24943633377552032, "rewards/semantic_correctness_reward_func/mean": 0.4166439175605774, "rewards/semantic_correctness_reward_func/std": 0.1954047828912735, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 170.52679443359375, "completions/mean_terminated_length": 151.0410919189453, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.308650932525287, "grad_norm": 0.02067430503666401, "kl": 0.012658357620239258, "learning_rate": 3.572298075514652e-07, "loss": 0.0104, "num_tokens": 322102323.0, "reward": 0.48799073696136475, "reward_std": 0.07766105234622955, "rewards/gemini_judge_reward_func/mean": 0.2243303507566452, "rewards/gemini_judge_reward_func/std": 0.32230180501937866, "rewards/semantic_correctness_reward_func/mean": 0.4897927939891815, "rewards/semantic_correctness_reward_func/std": 0.24560298025608063, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 150.11607360839844, "completions/mean_terminated_length": 138.25340270996094, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.30899236054799195, "grad_norm": 0.020890070125460625, "kl": 0.016417980194091797, "learning_rate": 3.511175705587433e-07, "loss": 0.0049, "num_tokens": 322448521.0, "reward": 0.45954275131225586, "reward_std": 0.061038125306367874, "rewards/gemini_judge_reward_func/mean": 0.1517857164144516, "rewards/gemini_judge_reward_func/std": 0.28382623195648193, "rewards/semantic_correctness_reward_func/mean": 0.4568919539451599, "rewards/semantic_correctness_reward_func/std": 0.22153709828853607, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 164.74554443359375, "completions/mean_terminated_length": 149.1227264404297, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.30933378857069693, "grad_norm": 0.021773984655737877, "kl": 0.015863895416259766, "learning_rate": 3.450561729994534e-07, "loss": -0.0183, "num_tokens": 322804444.0, "reward": 0.41062042117118835, "reward_std": 0.06523442268371582, "rewards/gemini_judge_reward_func/mean": 0.1026785746216774, "rewards/gemini_judge_reward_func/std": 0.2259845733642578, "rewards/semantic_correctness_reward_func/mean": 0.45349493622779846, "rewards/semantic_correctness_reward_func/std": 0.21488092839717865, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 158.84375, "completions/mean_terminated_length": 147.09954833984375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.3096752165934019, "grad_norm": 0.020778290927410126, "kl": 0.015050888061523438, "learning_rate": 3.390456811600673e-07, "loss": -0.0159, "num_tokens": 323163273.0, "reward": 0.4392737150192261, "reward_std": 0.07165578007698059, "rewards/gemini_judge_reward_func/mean": 0.1149553582072258, "rewards/gemini_judge_reward_func/std": 0.232961043715477, "rewards/semantic_correctness_reward_func/mean": 0.4113505184650421, "rewards/semantic_correctness_reward_func/std": 0.19492052495479584, "rewards/xmlcount_reward_func/mean": 0.7775535583496094, "rewards/xmlcount_reward_func/std": 0.417745977640152, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 147.1919708251953, "completions/mean_terminated_length": 143.26010131835938, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.31001664461610684, "grad_norm": 0.020663078874349594, "kl": 0.01226496696472168, "learning_rate": 3.3308616077036113e-07, "loss": 0.0115, "num_tokens": 323509436.0, "reward": 0.4209424555301666, "reward_std": 0.0577840618789196, "rewards/gemini_judge_reward_func/mean": 0.109375, "rewards/gemini_judge_reward_func/std": 0.22154484689235687, "rewards/semantic_correctness_reward_func/mean": 0.4202120304107666, "rewards/semantic_correctness_reward_func/std": 0.20789223909378052, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 157.61607360839844, "completions/mean_terminated_length": 145.85520935058594, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.3103580726388118, "grad_norm": 0.02018766477704048, "kl": 0.012340068817138672, "learning_rate": 3.271776770026963e-07, "loss": -0.0238, "num_tokens": 323846074.0, "reward": 0.43793949484825134, "reward_std": 0.04598357900977135, "rewards/gemini_judge_reward_func/mean": 0.0970982164144516, "rewards/gemini_judge_reward_func/std": 0.20721961557865143, "rewards/semantic_correctness_reward_func/mean": 0.3867507874965668, "rewards/semantic_correctness_reward_func/std": 0.20682360231876373, "rewards/xmlcount_reward_func/mean": 0.8043751120567322, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 152.21429443359375, "completions/mean_terminated_length": 144.36036682128906, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.3106995006615168, "grad_norm": 0.019882354885339737, "kl": 0.012122154235839844, "learning_rate": 3.213202944713023e-07, "loss": 0.001, "num_tokens": 324217038.0, "reward": 0.43322232365608215, "reward_std": 0.05336981639266014, "rewards/gemini_judge_reward_func/mean": 0.1372767835855484, "rewards/gemini_judge_reward_func/std": 0.25469791889190674, "rewards/semantic_correctness_reward_func/mean": 0.4615578353404999, "rewards/semantic_correctness_reward_func/std": 0.2347070872783661, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 164.65179443359375, "completions/mean_terminated_length": 152.98643493652344, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3110409286842218, "grad_norm": 0.01976594142615795, "kl": 0.011294364929199219, "learning_rate": 3.1551407723157734e-07, "loss": 0.0006, "num_tokens": 324552060.0, "reward": 0.4899020493030548, "reward_std": 0.06357000023126602, "rewards/gemini_judge_reward_func/mean": 0.1662946492433548, "rewards/gemini_judge_reward_func/std": 0.2800786793231964, "rewards/semantic_correctness_reward_func/mean": 0.481358140707016, "rewards/semantic_correctness_reward_func/std": 0.20851466059684753, "rewards/xmlcount_reward_func/mean": 0.8177813291549683, "rewards/xmlcount_reward_func/std": 0.3879494369029999, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 164.8125, "completions/mean_terminated_length": 153.14932250976562, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.3113823567069267, "grad_norm": 0.01866706646978855, "kl": 0.013501882553100586, "learning_rate": 3.0975908877938277e-07, "loss": 0.0139, "num_tokens": 324893338.0, "reward": 0.47727853059768677, "reward_std": 0.06310079991817474, "rewards/gemini_judge_reward_func/mean": 0.1763392835855484, "rewards/gemini_judge_reward_func/std": 0.2794100046157837, "rewards/semantic_correctness_reward_func/mean": 0.46071383357048035, "rewards/semantic_correctness_reward_func/std": 0.22256124019622803, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 172.96429443359375, "completions/mean_terminated_length": 149.54127502441406, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.3117237847296317, "grad_norm": 0.021747853606939316, "kl": 0.011886119842529297, "learning_rate": 3.040553920503503e-07, "loss": 0.0192, "num_tokens": 325232410.0, "reward": 0.449485719203949, "reward_std": 0.0661468431353569, "rewards/gemini_judge_reward_func/mean": 0.1171875, "rewards/gemini_judge_reward_func/std": 0.24131856858730316, "rewards/semantic_correctness_reward_func/mean": 0.4311159551143646, "rewards/semantic_correctness_reward_func/std": 0.2181245982646942, "rewards/xmlcount_reward_func/mean": 0.7909687757492065, "rewards/xmlcount_reward_func/std": 0.4050505757331848, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 173.61607360839844, "completions/mean_terminated_length": 150.21099853515625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3120652127523367, "grad_norm": 0.0281588863581419, "kl": 0.01603221893310547, "learning_rate": 2.984030494191942e-07, "loss": 0.0035, "num_tokens": 325576556.0, "reward": 0.47375205159187317, "reward_std": 0.06507349759340286, "rewards/gemini_judge_reward_func/mean": 0.1450892835855484, "rewards/gemini_judge_reward_func/std": 0.25965309143066406, "rewards/semantic_correctness_reward_func/mean": 0.46983152627944946, "rewards/semantic_correctness_reward_func/std": 0.21551068127155304, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 151.9866180419922, "completions/mean_terminated_length": 136.13182067871094, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3124066407750416, "grad_norm": 0.023043038323521614, "kl": 0.019103288650512695, "learning_rate": 2.928021226990263e-07, "loss": -0.0209, "num_tokens": 325931713.0, "reward": 0.4459460973739624, "reward_std": 0.058497704565525055, "rewards/gemini_judge_reward_func/mean": 0.140625, "rewards/gemini_judge_reward_func/std": 0.26844772696495056, "rewards/semantic_correctness_reward_func/mean": 0.44698023796081543, "rewards/semantic_correctness_reward_func/std": 0.2093106359243393, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 148.45089721679688, "completions/mean_terminated_length": 144.52467346191406, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.3127480687977466, "grad_norm": 0.021820751950144768, "kl": 0.01890873908996582, "learning_rate": 2.8725267314068496e-07, "loss": -0.0219, "num_tokens": 326264278.0, "reward": 0.424966424703598, "reward_std": 0.04606298357248306, "rewards/gemini_judge_reward_func/mean": 0.078125, "rewards/gemini_judge_reward_func/std": 0.18801312148571014, "rewards/semantic_correctness_reward_func/mean": 0.43133196234703064, "rewards/semantic_correctness_reward_func/std": 0.19531051814556122, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 162.04019165039062, "completions/mean_terminated_length": 146.36817932128906, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.31308949682045156, "grad_norm": 0.020182453095912933, "kl": 0.01574563980102539, "learning_rate": 2.817547614320615e-07, "loss": 0.0028, "num_tokens": 326636615.0, "reward": 0.4213365912437439, "reward_std": 0.0604686439037323, "rewards/gemini_judge_reward_func/mean": 0.1629464328289032, "rewards/gemini_judge_reward_func/std": 0.2845219075679779, "rewards/semantic_correctness_reward_func/mean": 0.4580489695072174, "rewards/semantic_correctness_reward_func/std": 0.2246883362531662, "rewards/xmlcount_reward_func/mean": 0.661370575428009, "rewards/xmlcount_reward_func/std": 0.47499868273735046, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 164.82589721679688, "completions/mean_terminated_length": 149.2045440673828, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.3134309248431565, "grad_norm": 0.0189261082559824, "kl": 0.017313480377197266, "learning_rate": 2.763084476974376e-07, "loss": 0.0112, "num_tokens": 327019208.0, "reward": 0.4103511869907379, "reward_std": 0.07770156115293503, "rewards/gemini_judge_reward_func/mean": 0.1261160671710968, "rewards/gemini_judge_reward_func/std": 0.2615041136741638, "rewards/semantic_correctness_reward_func/mean": 0.44102367758750916, "rewards/semantic_correctness_reward_func/std": 0.22343264520168304, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 172.2544708251953, "completions/mean_terminated_length": 148.81192016601562, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.31377235286586147, "grad_norm": 0.021766338497400284, "kl": 0.014467239379882812, "learning_rate": 2.7091379149682683e-07, "loss": -0.033, "num_tokens": 327396877.0, "reward": 0.40158364176750183, "reward_std": 0.054425518959760666, "rewards/gemini_judge_reward_func/mean": 0.0881696417927742, "rewards/gemini_judge_reward_func/std": 0.18727631866931915, "rewards/semantic_correctness_reward_func/mean": 0.4373288154602051, "rewards/semantic_correctness_reward_func/std": 0.19881200790405273, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 161.6294708251953, "completions/mean_terminated_length": 149.92308044433594, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.31411378088856645, "grad_norm": 0.020311061292886734, "kl": 0.01554250717163086, "learning_rate": 2.655708518253258e-07, "loss": -0.0004, "num_tokens": 327761186.0, "reward": 0.46613025665283203, "reward_std": 0.06654452532529831, "rewards/gemini_judge_reward_func/mean": 0.1171875, "rewards/gemini_judge_reward_func/std": 0.23184122145175934, "rewards/semantic_correctness_reward_func/mean": 0.4249636232852936, "rewards/semantic_correctness_reward_func/std": 0.2204969823360443, "rewards/xmlcount_reward_func/mean": 0.8356562852859497, "rewards/xmlcount_reward_func/std": 0.3725454807281494, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 160.41964721679688, "completions/mean_terminated_length": 140.70318603515625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3144552089112714, "grad_norm": 0.019982021301984787, "kl": 0.0161285400390625, "learning_rate": 2.602796871124663e-07, "loss": -0.0086, "num_tokens": 328124864.0, "reward": 0.40349289774894714, "reward_std": 0.08313170075416565, "rewards/gemini_judge_reward_func/mean": 0.1439732164144516, "rewards/gemini_judge_reward_func/std": 0.2586561143398285, "rewards/semantic_correctness_reward_func/mean": 0.43135711550712585, "rewards/semantic_correctness_reward_func/std": 0.21311675012111664, "rewards/xmlcount_reward_func/mean": 0.6490803956985474, "rewards/xmlcount_reward_func/std": 0.47694966197013855, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 158.6607208251953, "completions/mean_terminated_length": 146.91403198242188, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.31479663693397636, "grad_norm": 0.01935208961367607, "kl": 0.01622152328491211, "learning_rate": 2.5504035522157853e-07, "loss": -0.0028, "num_tokens": 328461724.0, "reward": 0.4391644597053528, "reward_std": 0.0541006401181221, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.24600426852703094, "rewards/semantic_correctness_reward_func/mean": 0.417500764131546, "rewards/semantic_correctness_reward_func/std": 0.20058931410312653, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 154.89732360839844, "completions/mean_terminated_length": 147.06756591796875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.31513806495668134, "grad_norm": 0.020716093480587006, "kl": 0.01778697967529297, "learning_rate": 2.4985291344915675e-07, "loss": 0.0092, "num_tokens": 328833597.0, "reward": 0.441595196723938, "reward_std": 0.06813618540763855, "rewards/gemini_judge_reward_func/mean": 0.1082589253783226, "rewards/gemini_judge_reward_func/std": 0.23377598822116852, "rewards/semantic_correctness_reward_func/mean": 0.4209490418434143, "rewards/semantic_correctness_reward_func/std": 0.21312111616134644, "rewards/xmlcount_reward_func/mean": 0.7852544784545898, "rewards/xmlcount_reward_func/std": 0.41142624616622925, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 157.35269165039062, "completions/mean_terminated_length": 141.59544372558594, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.31547949297938627, "grad_norm": 0.02100345492362976, "kl": 0.016232967376708984, "learning_rate": 2.447174185242324e-07, "loss": -0.0323, "num_tokens": 329175456.0, "reward": 0.4355000853538513, "reward_std": 0.06009732559323311, "rewards/gemini_judge_reward_func/mean": 0.1450892835855484, "rewards/gemini_judge_reward_func/std": 0.278405100107193, "rewards/semantic_correctness_reward_func/mean": 0.4573217034339905, "rewards/semantic_correctness_reward_func/std": 0.20487044751644135, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 159.29019165039062, "completions/mean_terminated_length": 147.5520477294922, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.31582092100209125, "grad_norm": 0.01990508660674095, "kl": 0.015584707260131836, "learning_rate": 2.3963392660775576e-07, "loss": 0.0066, "num_tokens": 329545373.0, "reward": 0.4258612096309662, "reward_std": 0.05928758531808853, "rewards/gemini_judge_reward_func/mean": 0.1595982164144516, "rewards/gemini_judge_reward_func/std": 0.3021363317966461, "rewards/semantic_correctness_reward_func/mean": 0.41585955023765564, "rewards/semantic_correctness_reward_func/std": 0.20218245685100555, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 171.37501525878906, "completions/mean_terminated_length": 143.8709716796875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.31616234902479623, "grad_norm": 0.020098304376006126, "kl": 0.015714406967163086, "learning_rate": 2.3460249329197825e-07, "loss": 0.024, "num_tokens": 329909241.0, "reward": 0.4246380925178528, "reward_std": 0.0676727145910263, "rewards/gemini_judge_reward_func/mean": 0.1383928507566452, "rewards/gemini_judge_reward_func/std": 0.2524305284023285, "rewards/semantic_correctness_reward_func/mean": 0.4476812779903412, "rewards/semantic_correctness_reward_func/std": 0.21570633351802826, "rewards/xmlcount_reward_func/mean": 0.6993616819381714, "rewards/xmlcount_reward_func/std": 0.4566512405872345, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 168.4553680419922, "completions/mean_terminated_length": 144.9082489013672, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.31650377704750116, "grad_norm": 0.019722236320376396, "kl": 0.012249469757080078, "learning_rate": 2.296231735998511e-07, "loss": -0.02, "num_tokens": 330249435.0, "reward": 0.4518135190010071, "reward_std": 0.06725968420505524, "rewards/gemini_judge_reward_func/mean": 0.1484375, "rewards/gemini_judge_reward_func/std": 0.26996058225631714, "rewards/semantic_correctness_reward_func/mean": 0.424942284822464, "rewards/semantic_correctness_reward_func/std": 0.21814289689064026, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 158.0491180419922, "completions/mean_terminated_length": 150.2477569580078, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.31684520507020614, "grad_norm": 0.02072775363922119, "kl": 0.012631654739379883, "learning_rate": 2.2469602198441575e-07, "loss": 0.007, "num_tokens": 330590598.0, "reward": 0.4358016848564148, "reward_std": 0.06530667841434479, "rewards/gemini_judge_reward_func/mean": 0.1261160671710968, "rewards/gemini_judge_reward_func/std": 0.24027937650680542, "rewards/semantic_correctness_reward_func/mean": 0.3984636962413788, "rewards/semantic_correctness_reward_func/std": 0.20566481351852417, "rewards/xmlcount_reward_func/mean": 0.7641563415527344, "rewards/xmlcount_reward_func/std": 0.4263768792152405, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 169.29019165039062, "completions/mean_terminated_length": 149.7762451171875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.3171866330929111, "grad_norm": 0.02100742794573307, "kl": 0.01338648796081543, "learning_rate": 2.198210923282118e-07, "loss": 0.0037, "num_tokens": 330960599.0, "reward": 0.46286967396736145, "reward_std": 0.07256618142127991, "rewards/gemini_judge_reward_func/mean": 0.1383928507566452, "rewards/gemini_judge_reward_func/std": 0.250200092792511, "rewards/semantic_correctness_reward_func/mean": 0.4306160509586334, "rewards/semantic_correctness_reward_func/std": 0.21207194030284882, "rewards/xmlcount_reward_func/mean": 0.803473174571991, "rewards/xmlcount_reward_func/std": 0.39635196328163147, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 169.4375, "completions/mean_terminated_length": 145.91741943359375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.31752806111561604, "grad_norm": 0.019842946901917458, "kl": 0.013571023941040039, "learning_rate": 2.149984379426906e-07, "loss": -0.0122, "num_tokens": 331315505.0, "reward": 0.4548283517360687, "reward_std": 0.06730558723211288, "rewards/gemini_judge_reward_func/mean": 0.1439732164144516, "rewards/gemini_judge_reward_func/std": 0.2682309150695801, "rewards/semantic_correctness_reward_func/mean": 0.4489452540874481, "rewards/semantic_correctness_reward_func/std": 0.20901791751384735, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 156.74107360839844, "completions/mean_terminated_length": 144.9683380126953, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.317869489138321, "grad_norm": 0.019857613369822502, "kl": 0.014861583709716797, "learning_rate": 2.102281115676258e-07, "loss": -0.0096, "num_tokens": 331661999.0, "reward": 0.4678274691104889, "reward_std": 0.07853475958108902, "rewards/gemini_judge_reward_func/mean": 0.1529017835855484, "rewards/gemini_judge_reward_func/std": 0.2736615538597107, "rewards/semantic_correctness_reward_func/mean": 0.4782174527645111, "rewards/semantic_correctness_reward_func/std": 0.2053535431623459, "rewards/xmlcount_reward_func/mean": 0.7775580286979675, "rewards/xmlcount_reward_func/std": 0.41774842143058777, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 166.09375, "completions/mean_terminated_length": 150.49545288085938, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.318210917161026, "grad_norm": 0.019011061638593674, "kl": 0.012024164199829102, "learning_rate": 2.0551016537054492e-07, "loss": -0.0194, "num_tokens": 332014572.0, "reward": 0.4248262047767639, "reward_std": 0.06209308281540871, "rewards/gemini_judge_reward_func/mean": 0.1116071417927742, "rewards/gemini_judge_reward_func/std": 0.22295227646827698, "rewards/semantic_correctness_reward_func/mean": 0.41284507513046265, "rewards/semantic_correctness_reward_func/std": 0.1855388879776001, "rewards/xmlcount_reward_func/mean": 0.7440357208251953, "rewards/xmlcount_reward_func/std": 0.43177708983421326, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 165.08482360839844, "completions/mean_terminated_length": 149.46817016601562, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.31855234518373093, "grad_norm": 0.021508535370230675, "kl": 0.018635272979736328, "learning_rate": 2.008446509461498e-07, "loss": -0.015, "num_tokens": 332385227.0, "reward": 0.4379209876060486, "reward_std": 0.055039145052433014, "rewards/gemini_judge_reward_func/mean": 0.1383928507566452, "rewards/gemini_judge_reward_func/std": 0.22042377293109894, "rewards/semantic_correctness_reward_func/mean": 0.4470691978931427, "rewards/semantic_correctness_reward_func/std": 0.2077719122171402, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 151.96429443359375, "completions/mean_terminated_length": 148.05381774902344, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.3188937732064359, "grad_norm": 0.021709749475121498, "kl": 0.016178607940673828, "learning_rate": 1.962316193157593e-07, "loss": 0.0039, "num_tokens": 332752595.0, "reward": 0.44336017966270447, "reward_std": 0.07409544289112091, "rewards/gemini_judge_reward_func/mean": 0.1495535671710968, "rewards/gemini_judge_reward_func/std": 0.2739836871623993, "rewards/semantic_correctness_reward_func/mean": 0.436318576335907, "rewards/semantic_correctness_reward_func/std": 0.24862277507781982, "rewards/xmlcount_reward_func/mean": 0.7406874895095825, "rewards/xmlcount_reward_func/std": 0.4378414452075958, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 144.76339721679688, "completions/mean_terminated_length": 136.84234619140625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3192352012291409, "grad_norm": 0.02092774398624897, "kl": 0.015278339385986328, "learning_rate": 1.91671120926748e-07, "loss": -0.0262, "num_tokens": 333116046.0, "reward": 0.4193107783794403, "reward_std": 0.05722092092037201, "rewards/gemini_judge_reward_func/mean": 0.1294642835855484, "rewards/gemini_judge_reward_func/std": 0.24486233294010162, "rewards/semantic_correctness_reward_func/mean": 0.44337525963783264, "rewards/semantic_correctness_reward_func/std": 0.21482908725738525, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 162.9419708251953, "completions/mean_terminated_length": 151.25340270996094, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.3195766292518458, "grad_norm": 0.0197757575660944, "kl": 0.010863065719604492, "learning_rate": 1.871632056519962e-07, "loss": -0.0301, "num_tokens": 333433317.0, "reward": 0.49096450209617615, "reward_std": 0.07130220532417297, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.2295006811618805, "rewards/semantic_correctness_reward_func/mean": 0.4262508749961853, "rewards/semantic_correctness_reward_func/std": 0.22580935060977936, "rewards/xmlcount_reward_func/mean": 0.8937500715255737, "rewards/xmlcount_reward_func/std": 0.31029748916625977, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 154.36607360839844, "completions/mean_terminated_length": 142.56109619140625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.3199180572745508, "grad_norm": 0.01896924152970314, "kl": 0.014213323593139648, "learning_rate": 1.8270792278934302e-07, "loss": 0.0075, "num_tokens": 333789967.0, "reward": 0.4371952414512634, "reward_std": 0.053081054240465164, "rewards/gemini_judge_reward_func/mean": 0.1462053507566452, "rewards/gemini_judge_reward_func/std": 0.26701533794403076, "rewards/semantic_correctness_reward_func/mean": 0.46356528997421265, "rewards/semantic_correctness_reward_func/std": 0.21834862232208252, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 163.6875, "completions/mean_terminated_length": 140.00917053222656, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.3202594852972558, "grad_norm": 0.021720534190535545, "kl": 0.012692689895629883, "learning_rate": 1.7830532106104747e-07, "loss": 0.0044, "num_tokens": 334150361.0, "reward": 0.4325564503669739, "reward_std": 0.0697154626250267, "rewards/gemini_judge_reward_func/mean": 0.1305803507566452, "rewards/gemini_judge_reward_func/std": 0.24824969470500946, "rewards/semantic_correctness_reward_func/mean": 0.435871422290802, "rewards/semantic_correctness_reward_func/std": 0.2043805867433548, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 159.69644165039062, "completions/mean_terminated_length": 143.9818115234375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.3206009133199607, "grad_norm": 0.019939295947551727, "kl": 0.015135526657104492, "learning_rate": 1.7395544861325718e-07, "loss": 0.0161, "num_tokens": 334531525.0, "reward": 0.4231032729148865, "reward_std": 0.05319977179169655, "rewards/gemini_judge_reward_func/mean": 0.1272321492433548, "rewards/gemini_judge_reward_func/std": 0.24027156829833984, "rewards/semantic_correctness_reward_func/mean": 0.42881080508232117, "rewards/semantic_correctness_reward_func/std": 0.2193426787853241, "rewards/xmlcount_reward_func/mean": 0.7161206007003784, "rewards/xmlcount_reward_func/std": 0.4517506957054138, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 159.54464721679688, "completions/mean_terminated_length": 147.80996704101562, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3209423413426657, "grad_norm": 0.020010868087410927, "kl": 0.012491226196289062, "learning_rate": 1.696583530154794e-07, "loss": 0.0005, "num_tokens": 334851831.0, "reward": 0.48991096019744873, "reward_std": 0.0713018923997879, "rewards/gemini_judge_reward_func/mean": 0.1428571492433548, "rewards/gemini_judge_reward_func/std": 0.23844198882579803, "rewards/semantic_correctness_reward_func/mean": 0.4210367202758789, "rewards/semantic_correctness_reward_func/std": 0.23552283644676208, "rewards/xmlcount_reward_func/mean": 0.8714017868041992, "rewards/xmlcount_reward_func/std": 0.3367997407913208, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 165.13839721679688, "completions/mean_terminated_length": 137.4331817626953, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.3212837693653707, "grad_norm": 0.02136445976793766, "kl": 0.017918109893798828, "learning_rate": 1.6541408126006464e-07, "loss": -0.0136, "num_tokens": 335240942.0, "reward": 0.36811578273773193, "reward_std": 0.06013471260666847, "rewards/gemini_judge_reward_func/mean": 0.1417410671710968, "rewards/gemini_judge_reward_func/std": 0.2652287185192108, "rewards/semantic_correctness_reward_func/mean": 0.444409042596817, "rewards/semantic_correctness_reward_func/std": 0.22162111103534698, "rewards/xmlcount_reward_func/mean": 0.5563437342643738, "rewards/xmlcount_reward_func/std": 0.49736082553863525, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 184.04019165039062, "completions/mean_terminated_length": 156.9447021484375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.3216251973880756, "grad_norm": 0.01701374724507332, "kl": 0.012472152709960938, "learning_rate": 1.6122267976168783e-07, "loss": -0.0328, "num_tokens": 335593499.0, "reward": 0.425184041261673, "reward_std": 0.07142052799463272, "rewards/gemini_judge_reward_func/mean": 0.1484375, "rewards/gemini_judge_reward_func/std": 0.2593710422515869, "rewards/semantic_correctness_reward_func/mean": 0.4347950518131256, "rewards/semantic_correctness_reward_func/std": 0.2152114361524582, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 163.93304443359375, "completions/mean_terminated_length": 156.1846923828125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.3219666254107806, "grad_norm": 0.020743004977703094, "kl": 0.013138771057128906, "learning_rate": 1.5708419435684463e-07, "loss": -0.05, "num_tokens": 335956992.0, "reward": 0.4332004487514496, "reward_std": 0.0775144100189209, "rewards/gemini_judge_reward_func/mean": 0.1283482164144516, "rewards/gemini_judge_reward_func/std": 0.25165361166000366, "rewards/semantic_correctness_reward_func/mean": 0.40780559182167053, "rewards/semantic_correctness_reward_func/std": 0.22031456232070923, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 175.87501525878906, "completions/mean_terminated_length": 148.51612854003906, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.32230805343348556, "grad_norm": 0.019393648952245712, "kl": 0.0162045955657959, "learning_rate": 1.5299867030334815e-07, "loss": -0.0326, "num_tokens": 336327828.0, "reward": 0.38535192608833313, "reward_std": 0.06319523602724075, "rewards/gemini_judge_reward_func/mean": 0.0982142835855484, "rewards/gemini_judge_reward_func/std": 0.20871469378471375, "rewards/semantic_correctness_reward_func/mean": 0.4343844950199127, "rewards/semantic_correctness_reward_func/std": 0.1936558037996292, "rewards/xmlcount_reward_func/mean": 0.6479731798171997, "rewards/xmlcount_reward_func/std": 0.47700217366218567, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 165.7366180419922, "completions/mean_terminated_length": 154.0859832763672, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.3226494814561905, "grad_norm": 0.019887909293174744, "kl": 0.014321565628051758, "learning_rate": 1.4896615227983468e-07, "loss": -0.0364, "num_tokens": 336685281.0, "reward": 0.4396510124206543, "reward_std": 0.05946972966194153, "rewards/gemini_judge_reward_func/mean": 0.1183035746216774, "rewards/gemini_judge_reward_func/std": 0.2142873853445053, "rewards/semantic_correctness_reward_func/mean": 0.42439767718315125, "rewards/semantic_correctness_reward_func/std": 0.2012360543012619, "rewards/xmlcount_reward_func/mean": 0.7686250805854797, "rewards/xmlcount_reward_func/std": 0.42356836795806885, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 173.7232208251953, "completions/mean_terminated_length": 154.3105010986328, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.32299090947889547, "grad_norm": 0.020988894626498222, "kl": 0.011559724807739258, "learning_rate": 1.4498668438527597e-07, "loss": 0.0017, "num_tokens": 337010607.0, "reward": 0.4864182770252228, "reward_std": 0.060431286692619324, "rewards/gemini_judge_reward_func/mean": 0.1573660671710968, "rewards/gemini_judge_reward_func/std": 0.26588836312294006, "rewards/semantic_correctness_reward_func/mean": 0.4371090531349182, "rewards/semantic_correctness_reward_func/std": 0.2134704738855362, "rewards/xmlcount_reward_func/mean": 0.8401250243186951, "rewards/xmlcount_reward_func/std": 0.3684578835964203, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 160.3794708251953, "completions/mean_terminated_length": 156.50672912597656, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.32333233750160045, "grad_norm": 0.02066732570528984, "kl": 0.013165950775146484, "learning_rate": 1.4106031013849498e-07, "loss": -0.0018, "num_tokens": 337352116.0, "reward": 0.485461950302124, "reward_std": 0.05010446533560753, "rewards/gemini_judge_reward_func/mean": 0.1183035746216774, "rewards/gemini_judge_reward_func/std": 0.2538047432899475, "rewards/semantic_correctness_reward_func/mean": 0.4389525055885315, "rewards/semantic_correctness_reward_func/std": 0.21956577897071838, "rewards/xmlcount_reward_func/mean": 0.8758750557899475, "rewards/xmlcount_reward_func/std": 0.33179107308387756, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 159.58482360839844, "completions/mean_terminated_length": 147.85069274902344, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.32367376552430543, "grad_norm": 0.020279573276638985, "kl": 0.015435457229614258, "learning_rate": 1.3718707247769137e-07, "loss": -0.0023, "num_tokens": 337685867.0, "reward": 0.4606628715991974, "reward_std": 0.06019989401102066, "rewards/gemini_judge_reward_func/mean": 0.1428571492433548, "rewards/gemini_judge_reward_func/std": 0.25873589515686035, "rewards/semantic_correctness_reward_func/mean": 0.4445998966693878, "rewards/semantic_correctness_reward_func/std": 0.19706618785858154, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 163.2544708251953, "completions/mean_terminated_length": 151.5701446533203, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.32401519354701036, "grad_norm": 0.01981574296951294, "kl": 0.013592720031738281, "learning_rate": 1.333670137599713e-07, "loss": -0.0039, "num_tokens": 338015252.0, "reward": 0.436404287815094, "reward_std": 0.06201518699526787, "rewards/gemini_judge_reward_func/mean": 0.1439732164144516, "rewards/gemini_judge_reward_func/std": 0.25207090377807617, "rewards/semantic_correctness_reward_func/mean": 0.4283246397972107, "rewards/semantic_correctness_reward_func/std": 0.2121453732252121, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 189.2544708251953, "completions/mean_terminated_length": 158.3379669189453, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.32435662156971534, "grad_norm": 0.01981574296951294, "kl": 0.01743006706237793, "learning_rate": 1.333670137599713e-07, "loss": 0.0092, "num_tokens": 338382561.0, "reward": 0.3984151780605316, "reward_std": 0.04683025926351547, "rewards/gemini_judge_reward_func/mean": 0.1026785746216774, "rewards/gemini_judge_reward_func/std": 0.19681765139102936, "rewards/semantic_correctness_reward_func/mean": 0.41927212476730347, "rewards/semantic_correctness_reward_func/std": 0.20039360225200653, "rewards/xmlcount_reward_func/mean": 0.6837233304977417, "rewards/xmlcount_reward_func/std": 0.4643874168395996, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 157.4375, "completions/mean_terminated_length": 145.67420959472656, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.3246980495924203, "grad_norm": 0.019179075956344604, "kl": 0.011970043182373047, "learning_rate": 1.2960017576088445e-07, "loss": 0.0004, "num_tokens": 338736863.0, "reward": 0.4437229633331299, "reward_std": 0.07157056778669357, "rewards/gemini_judge_reward_func/mean": 0.1595982164144516, "rewards/gemini_judge_reward_func/std": 0.2656058371067047, "rewards/semantic_correctness_reward_func/mean": 0.43366822600364685, "rewards/semantic_correctness_reward_func/std": 0.21503110229969025, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 174.68304443359375, "completions/mean_terminated_length": 151.3073272705078, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.32503947761512525, "grad_norm": 0.019320348277688026, "kl": 0.017798185348510742, "learning_rate": 1.2588659967396998e-07, "loss": -0.0043, "num_tokens": 339097528.0, "reward": 0.4750409424304962, "reward_std": 0.08295747637748718, "rewards/gemini_judge_reward_func/mean": 0.1796875, "rewards/gemini_judge_reward_func/std": 0.29441800713539124, "rewards/semantic_correctness_reward_func/mean": 0.478579580783844, "rewards/semantic_correctness_reward_func/std": 0.2249763160943985, "rewards/xmlcount_reward_func/mean": 0.7686249613761902, "rewards/xmlcount_reward_func/std": 0.42356839776039124, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 168.25894165039062, "completions/mean_terminated_length": 148.72145080566406, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.32538090563783023, "grad_norm": 0.0192513857036829, "kl": 0.014150142669677734, "learning_rate": 1.222263261102985e-07, "loss": -0.0178, "num_tokens": 339463978.0, "reward": 0.43717771768569946, "reward_std": 0.06090007722377777, "rewards/gemini_judge_reward_func/mean": 0.1294642835855484, "rewards/gemini_judge_reward_func/std": 0.23313555121421814, "rewards/semantic_correctness_reward_func/mean": 0.4477901756763458, "rewards/semantic_correctness_reward_func/std": 0.18037478625774384, "rewards/xmlcount_reward_func/mean": 0.7395849227905273, "rewards/xmlcount_reward_func/std": 0.4368475377559662, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 183.24554443359375, "completions/mean_terminated_length": 152.10647583007812, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3257223336605352, "grad_norm": 0.019445307552814484, "kl": 0.013372421264648438, "learning_rate": 1.1861939509803688e-07, "loss": -0.0261, "num_tokens": 339827497.0, "reward": 0.4099394977092743, "reward_std": 0.05597153678536415, "rewards/gemini_judge_reward_func/mean": 0.0870535746216774, "rewards/gemini_judge_reward_func/std": 0.1914946585893631, "rewards/semantic_correctness_reward_func/mean": 0.4098401665687561, "rewards/semantic_correctness_reward_func/std": 0.17344339191913605, "rewards/xmlcount_reward_func/mean": 0.7328750491142273, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 165.41964721679688, "completions/mean_terminated_length": 141.7889862060547, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.32606376168324014, "grad_norm": 0.01958622597157955, "kl": 0.012957572937011719, "learning_rate": 1.1506584608200366e-07, "loss": 0.0071, "num_tokens": 340206199.0, "reward": 0.4157373011112213, "reward_std": 0.062380947172641754, "rewards/gemini_judge_reward_func/mean": 0.1316964328289032, "rewards/gemini_judge_reward_func/std": 0.2482219636440277, "rewards/semantic_correctness_reward_func/mean": 0.42104366421699524, "rewards/semantic_correctness_reward_func/std": 0.193641796708107, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 153.5357208251953, "completions/mean_terminated_length": 153.5357208251953, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.3264051897059451, "grad_norm": 0.0198379959911108, "kl": 0.014824390411376953, "learning_rate": 1.1156571792324212e-07, "loss": -0.0188, "num_tokens": 340538411.0, "reward": 0.4334469139575958, "reward_std": 0.06941147148609161, "rewards/gemini_judge_reward_func/mean": 0.1183035746216774, "rewards/gemini_judge_reward_func/std": 0.2294461578130722, "rewards/semantic_correctness_reward_func/mean": 0.4291272759437561, "rewards/semantic_correctness_reward_func/std": 0.22070704400539398, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 144.6116180419922, "completions/mean_terminated_length": 140.6681671142578, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3267466177286501, "grad_norm": 0.020753346383571625, "kl": 0.015350341796875, "learning_rate": 1.0811904889859337e-07, "loss": 0.0, "num_tokens": 340899920.0, "reward": 0.44371187686920166, "reward_std": 0.0667320117354393, "rewards/gemini_judge_reward_func/mean": 0.1540178507566452, "rewards/gemini_judge_reward_func/std": 0.2894744277000427, "rewards/semantic_correctness_reward_func/mean": 0.4805235266685486, "rewards/semantic_correctness_reward_func/std": 0.20522719621658325, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 160.96429443359375, "completions/mean_terminated_length": 149.24887084960938, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.327088045751355, "grad_norm": 0.02153160236775875, "kl": 0.020434141159057617, "learning_rate": 1.0472587670027678e-07, "loss": 0.0197, "num_tokens": 341271724.0, "reward": 0.44719889760017395, "reward_std": 0.08111313730478287, "rewards/gemini_judge_reward_func/mean": 0.1629464328289032, "rewards/gemini_judge_reward_func/std": 0.2903720736503601, "rewards/semantic_correctness_reward_func/mean": 0.46769067645072937, "rewards/semantic_correctness_reward_func/std": 0.23603184521198273, "rewards/xmlcount_reward_func/mean": 0.7212054133415222, "rewards/xmlcount_reward_func/std": 0.4433631896972656, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 162.07144165039062, "completions/mean_terminated_length": 154.30630493164062, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.32742947377406, "grad_norm": 0.01959027163684368, "kl": 0.011683225631713867, "learning_rate": 1.0138623843548078e-07, "loss": -0.0229, "num_tokens": 341623200.0, "reward": 0.47252076864242554, "reward_std": 0.07059833407402039, "rewards/gemini_judge_reward_func/mean": 0.1395089328289032, "rewards/gemini_judge_reward_func/std": 0.250137597322464, "rewards/semantic_correctness_reward_func/mean": 0.44802334904670715, "rewards/semantic_correctness_reward_func/std": 0.1806076020002365, "rewards/xmlcount_reward_func/mean": 0.8177813291549683, "rewards/xmlcount_reward_func/std": 0.3879494369029999, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 147.3303680419922, "completions/mean_terminated_length": 139.43243408203125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.327770901796765, "grad_norm": 0.022266387939453125, "kl": 0.018918991088867188, "learning_rate": 9.810017062595322e-08, "loss": -0.013, "num_tokens": 341993126.0, "reward": 0.4521217942237854, "reward_std": 0.07054702937602997, "rewards/gemini_judge_reward_func/mean": 0.1863839328289032, "rewards/gemini_judge_reward_func/std": 0.288268119096756, "rewards/semantic_correctness_reward_func/mean": 0.4935908019542694, "rewards/semantic_correctness_reward_func/std": 0.23850572109222412, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 161.08482360839844, "completions/mean_terminated_length": 145.39544677734375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.3281123298194699, "grad_norm": 0.020122205838561058, "kl": 0.014005899429321289, "learning_rate": 9.486770920760668e-08, "loss": -0.015, "num_tokens": 342363049.0, "reward": 0.4218449890613556, "reward_std": 0.06150934845209122, "rewards/gemini_judge_reward_func/mean": 0.1372767835855484, "rewards/gemini_judge_reward_func/std": 0.2798641622066498, "rewards/semantic_correctness_reward_func/mean": 0.43818897008895874, "rewards/semantic_correctness_reward_func/std": 0.22415503859519958, "rewards/xmlcount_reward_func/mean": 0.698241114616394, "rewards/xmlcount_reward_func/std": 0.4598964750766754, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 181.9241180419922, "completions/mean_terminated_length": 150.73611450195312, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3284537578421749, "grad_norm": 0.026615537703037262, "kl": 0.02645087242126465, "learning_rate": 9.16888895301199e-08, "loss": 0.0051, "num_tokens": 342726040.0, "reward": 0.43222251534461975, "reward_std": 0.07855530083179474, "rewards/gemini_judge_reward_func/mean": 0.1964285671710968, "rewards/gemini_judge_reward_func/std": 0.29840490221977234, "rewards/semantic_correctness_reward_func/mean": 0.4771032929420471, "rewards/semantic_correctness_reward_func/std": 0.2208947241306305, "rewards/xmlcount_reward_func/mean": 0.6455759406089783, "rewards/xmlcount_reward_func/std": 0.4838610589504242, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 162.92857360839844, "completions/mean_terminated_length": 147.27272033691406, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3287951858648799, "grad_norm": 0.024194782599806786, "kl": 0.014604568481445312, "learning_rate": 8.856374635655696e-08, "loss": -0.0229, "num_tokens": 343096860.0, "reward": 0.4131552577018738, "reward_std": 0.05632089450955391, "rewards/gemini_judge_reward_func/mean": 0.1205357164144516, "rewards/gemini_judge_reward_func/std": 0.23313553631305695, "rewards/semantic_correctness_reward_func/mean": 0.4044993221759796, "rewards/semantic_correctness_reward_func/std": 0.21371452510356903, "rewards/xmlcount_reward_func/mean": 0.7101027369499207, "rewards/xmlcount_reward_func/std": 0.45508646965026855, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 157.8169708251953, "completions/mean_terminated_length": 138.0410919189453, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.3291366138875848, "grad_norm": 0.020719485357403755, "kl": 0.017660140991210938, "learning_rate": 8.549231386298151e-08, "loss": 0.0174, "num_tokens": 343443715.0, "reward": 0.42168301343917847, "reward_std": 0.05975125730037689, "rewards/gemini_judge_reward_func/mean": 0.1439732164144516, "rewards/gemini_judge_reward_func/std": 0.2661329209804535, "rewards/semantic_correctness_reward_func/mean": 0.46196863055229187, "rewards/semantic_correctness_reward_func/std": 0.20193101465702057, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 159.6741180419922, "completions/mean_terminated_length": 151.88739013671875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.3294780419102898, "grad_norm": 0.020591214299201965, "kl": 0.015319347381591797, "learning_rate": 8.247462563808816e-08, "loss": -0.0066, "num_tokens": 343756414.0, "reward": 0.44394856691360474, "reward_std": 0.059848301112651825, "rewards/gemini_judge_reward_func/mean": 0.1082589253783226, "rewards/gemini_judge_reward_func/std": 0.19306614995002747, "rewards/semantic_correctness_reward_func/mean": 0.43022483587265015, "rewards/semantic_correctness_reward_func/std": 0.2131272703409195, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 166.80804443359375, "completions/mean_terminated_length": 151.22271728515625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.32981946993299477, "grad_norm": 0.02128966711461544, "kl": 0.01815199851989746, "learning_rate": 7.951071468283166e-08, "loss": -0.0017, "num_tokens": 344126527.0, "reward": 0.40433269739151, "reward_std": 0.07133690267801285, "rewards/gemini_judge_reward_func/mean": 0.1127232164144516, "rewards/gemini_judge_reward_func/std": 0.22301679849624634, "rewards/semantic_correctness_reward_func/mean": 0.4734667241573334, "rewards/semantic_correctness_reward_func/std": 0.20783737301826477, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 170.6116180419922, "completions/mean_terminated_length": 151.1278533935547, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3301608979556997, "grad_norm": 0.01904473453760147, "kl": 0.013091087341308594, "learning_rate": 7.660061341006719e-08, "loss": -0.0302, "num_tokens": 344479128.0, "reward": 0.43435797095298767, "reward_std": 0.05595193803310394, "rewards/gemini_judge_reward_func/mean": 0.125, "rewards/gemini_judge_reward_func/std": 0.25278717279434204, "rewards/semantic_correctness_reward_func/mean": 0.44712895154953003, "rewards/semantic_correctness_reward_func/std": 0.22964619100093842, "rewards/xmlcount_reward_func/mean": 0.737330436706543, "rewards/xmlcount_reward_func/std": 0.439359575510025, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 159.1919708251953, "completions/mean_terminated_length": 151.40090942382812, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.3305023259784047, "grad_norm": 0.01983231119811535, "kl": 0.011693239212036133, "learning_rate": 7.374435364419675e-08, "loss": -0.0318, "num_tokens": 344824143.0, "reward": 0.4040999114513397, "reward_std": 0.048461802303791046, "rewards/gemini_judge_reward_func/mean": 0.1037946417927742, "rewards/gemini_judge_reward_func/std": 0.20393303036689758, "rewards/semantic_correctness_reward_func/mean": 0.41866016387939453, "rewards/semantic_correctness_reward_func/std": 0.20052829384803772, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 152.16519165039062, "completions/mean_terminated_length": 140.330322265625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.33084375400110966, "grad_norm": 0.020666640251874924, "kl": 0.014385223388671875, "learning_rate": 7.094196662081832e-08, "loss": -0.0165, "num_tokens": 345193180.0, "reward": 0.4273673892021179, "reward_std": 0.06679557263851166, "rewards/gemini_judge_reward_func/mean": 0.1417410671710968, "rewards/gemini_judge_reward_func/std": 0.24774518609046936, "rewards/semantic_correctness_reward_func/mean": 0.4233546555042267, "rewards/semantic_correctness_reward_func/std": 0.18689891695976257, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 151.37054443359375, "completions/mean_terminated_length": 147.45741271972656, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.3311851820238146, "grad_norm": 0.020662736147642136, "kl": 0.012186050415039062, "learning_rate": 6.819348298638839e-08, "loss": 0.0109, "num_tokens": 345555479.0, "reward": 0.422772616147995, "reward_std": 0.0710143893957138, "rewards/gemini_judge_reward_func/mean": 0.1194196417927742, "rewards/gemini_judge_reward_func/std": 0.2331113964319229, "rewards/semantic_correctness_reward_func/mean": 0.4271486699581146, "rewards/semantic_correctness_reward_func/std": 0.2294951230287552, "rewards/xmlcount_reward_func/mean": 0.7239375710487366, "rewards/xmlcount_reward_func/std": 0.4488601088523865, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 161.2991180419922, "completions/mean_terminated_length": 149.58824157714844, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.33152661004651957, "grad_norm": 0.01932770572602749, "kl": 0.012654304504394531, "learning_rate": 6.549893279788278e-08, "loss": -0.0002, "num_tokens": 345886710.0, "reward": 0.4679696559906006, "reward_std": 0.0628255307674408, "rewards/gemini_judge_reward_func/mean": 0.1339285671710968, "rewards/gemini_judge_reward_func/std": 0.26455092430114746, "rewards/semantic_correctness_reward_func/mean": 0.44534817337989807, "rewards/semantic_correctness_reward_func/std": 0.19571352005004883, "rewards/xmlcount_reward_func/mean": 0.8133214712142944, "rewards/xmlcount_reward_func/std": 0.3857904076576233, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 172.18751525878906, "completions/mean_terminated_length": 160.6244354248047, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.33186803806922455, "grad_norm": 0.02037588320672512, "kl": 0.011893272399902344, "learning_rate": 6.285834552247127e-08, "loss": -0.0302, "num_tokens": 346223572.0, "reward": 0.4898928105831146, "reward_std": 0.0647522360086441, "rewards/gemini_judge_reward_func/mean": 0.1305803507566452, "rewards/gemini_judge_reward_func/std": 0.22947613894939423, "rewards/semantic_correctness_reward_func/mean": 0.4365532100200653, "rewards/semantic_correctness_reward_func/std": 0.18644456565380096, "rewards/xmlcount_reward_func/mean": 0.8758750557899475, "rewards/xmlcount_reward_func/std": 0.33179107308387756, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 164.3794708251953, "completions/mean_terminated_length": 140.72018432617188, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.3322094660919295, "grad_norm": 0.027246547862887383, "kl": 0.01928424835205078, "learning_rate": 6.027175003719354e-08, "loss": -0.0297, "num_tokens": 346613681.0, "reward": 0.34606069326400757, "reward_std": 0.04251888021826744, "rewards/gemini_judge_reward_func/mean": 0.0491071417927742, "rewards/gemini_judge_reward_func/std": 0.13325557112693787, "rewards/semantic_correctness_reward_func/mean": 0.3785982131958008, "rewards/semantic_correctness_reward_func/std": 0.20065419375896454, "rewards/xmlcount_reward_func/mean": 0.6267456412315369, "rewards/xmlcount_reward_func/std": 0.4845307171344757, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 156.5178680419922, "completions/mean_terminated_length": 136.7123260498047, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.33255089411463445, "grad_norm": 0.02008494734764099, "kl": 0.01699686050415039, "learning_rate": 5.773917462864265e-08, "loss": -0.0347, "num_tokens": 347005969.0, "reward": 0.38510963320732117, "reward_std": 0.051969029009342194, "rewards/gemini_judge_reward_func/mean": 0.0892857164144516, "rewards/gemini_judge_reward_func/std": 0.21270503103733063, "rewards/semantic_correctness_reward_func/mean": 0.38847652077674866, "rewards/semantic_correctness_reward_func/std": 0.1963837593793869, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 1024.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 175.02232360839844, "completions/mean_terminated_length": 143.57870483398438, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.33289232213733944, "grad_norm": 0.019948428496718407, "kl": 0.01439356803894043, "learning_rate": 5.526064699265754e-08, "loss": -0.0112, "num_tokens": 347383882.0, "reward": 0.3975110352039337, "reward_std": 0.06101817265152931, "rewards/gemini_judge_reward_func/mean": 0.1238839253783226, "rewards/gemini_judge_reward_func/std": 0.269942045211792, "rewards/semantic_correctness_reward_func/mean": 0.41703715920448303, "rewards/semantic_correctness_reward_func/std": 0.25031930208206177, "rewards/xmlcount_reward_func/mean": 0.6613750457763672, "rewards/xmlcount_reward_func/std": 0.47500187158584595, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 175.96876525878906, "completions/mean_terminated_length": 156.6072998046875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.33323375016004436, "grad_norm": 0.020137697458267212, "kl": 0.014559745788574219, "learning_rate": 5.2836194234019976e-08, "loss": 0.0183, "num_tokens": 347752771.0, "reward": 0.4365932047367096, "reward_std": 0.07334822416305542, "rewards/gemini_judge_reward_func/mean": 0.1395089328289032, "rewards/gemini_judge_reward_func/std": 0.2611019015312195, "rewards/semantic_correctness_reward_func/mean": 0.4761890470981598, "rewards/semantic_correctness_reward_func/std": 0.20910099148750305, "rewards/xmlcount_reward_func/mean": 0.7138795256614685, "rewards/xmlcount_reward_func/std": 0.452818363904953, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 156.1294708251953, "completions/mean_terminated_length": 144.34841918945312, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.33357517818274934, "grad_norm": 0.018826456740498543, "kl": 0.01526784896850586, "learning_rate": 5.0465842866156965e-08, "loss": -0.0179, "num_tokens": 348123360.0, "reward": 0.41452330350875854, "reward_std": 0.0659627914428711, "rewards/gemini_judge_reward_func/mean": 0.1361607164144516, "rewards/gemini_judge_reward_func/std": 0.24692820012569427, "rewards/semantic_correctness_reward_func/mean": 0.4417950510978699, "rewards/semantic_correctness_reward_func/std": 0.21186378598213196, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 141.6116180419922, "completions/mean_terminated_length": 141.6116180419922, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.3339166062054543, "grad_norm": 0.021350812166929245, "kl": 0.014334440231323242, "learning_rate": 4.8149618810850454e-08, "loss": 0.0309, "num_tokens": 348474477.0, "reward": 0.44549697637557983, "reward_std": 0.07247848808765411, "rewards/gemini_judge_reward_func/mean": 0.1015625, "rewards/gemini_judge_reward_func/std": 0.23199227452278137, "rewards/semantic_correctness_reward_func/mean": 0.41560983657836914, "rewards/semantic_correctness_reward_func/std": 0.2107170671224594, "rewards/xmlcount_reward_func/mean": 0.8043750524520874, "rewards/xmlcount_reward_func/std": 0.3985843360424042, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 177.31251525878906, "completions/mean_terminated_length": 150.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.33425803422815925, "grad_norm": 0.020839158445596695, "kl": 0.013498067855834961, "learning_rate": 4.588754739795587e-08, "loss": -0.0097, "num_tokens": 348839815.0, "reward": 0.43120020627975464, "reward_std": 0.057902269065380096, "rewards/gemini_judge_reward_func/mean": 0.0993303582072258, "rewards/gemini_judge_reward_func/std": 0.197829008102417, "rewards/semantic_correctness_reward_func/mean": 0.4558401107788086, "rewards/semantic_correctness_reward_func/std": 0.19552071392536163, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 159.4419708251953, "completions/mean_terminated_length": 147.7058868408203, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.33459946225086423, "grad_norm": 0.0211932510137558, "kl": 0.014786720275878906, "learning_rate": 4.367965336512403e-08, "loss": -0.0091, "num_tokens": 349205298.0, "reward": 0.39420926570892334, "reward_std": 0.061161503195762634, "rewards/gemini_judge_reward_func/mean": 0.1171875, "rewards/gemini_judge_reward_func/std": 0.24705736339092255, "rewards/semantic_correctness_reward_func/mean": 0.37817126512527466, "rewards/semantic_correctness_reward_func/std": 0.22185632586479187, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 175.5491180419922, "completions/mean_terminated_length": 152.19723510742188, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.3349408902735692, "grad_norm": 0.020670127123594284, "kl": 0.01729416847229004, "learning_rate": 4.1525960857530244e-08, "loss": 0.0071, "num_tokens": 349555813.0, "reward": 0.44997820258140564, "reward_std": 0.06321458518505096, "rewards/gemini_judge_reward_func/mean": 0.1584821492433548, "rewards/gemini_judge_reward_func/std": 0.260422945022583, "rewards/semantic_correctness_reward_func/mean": 0.44487300515174866, "rewards/semantic_correctness_reward_func/std": 0.21552570164203644, "rewards/xmlcount_reward_func/mean": 0.7440268397331238, "rewards/xmlcount_reward_func/std": 0.4337250292301178, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 173.38839721679688, "completions/mean_terminated_length": 149.97705078125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.33528231829627414, "grad_norm": 0.0203064177185297, "kl": 0.0137939453125, "learning_rate": 3.9426493427611177e-08, "loss": 0.0006, "num_tokens": 349903064.0, "reward": 0.4368150234222412, "reward_std": 0.061177946627140045, "rewards/gemini_judge_reward_func/mean": 0.1216517835855484, "rewards/gemini_judge_reward_func/std": 0.23194913566112518, "rewards/semantic_correctness_reward_func/mean": 0.4392712414264679, "rewards/semantic_correctness_reward_func/std": 0.22072644531726837, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 166.4107208251953, "completions/mean_terminated_length": 146.83103942871094, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.3356237463189791, "grad_norm": 0.01848675310611725, "kl": 0.012476444244384766, "learning_rate": 3.738127403480507e-08, "loss": -0.0361, "num_tokens": 350265044.0, "reward": 0.4574826657772064, "reward_std": 0.08490362018346786, "rewards/gemini_judge_reward_func/mean": 0.1629464328289032, "rewards/gemini_judge_reward_func/std": 0.2894052565097809, "rewards/semantic_correctness_reward_func/mean": 0.4600202143192291, "rewards/semantic_correctness_reward_func/std": 0.21947798132896423, "rewards/xmlcount_reward_func/mean": 0.7507500648498535, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 1024.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 160.2678680419922, "completions/mean_terminated_length": 148.54299926757812, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.3359651743416841, "grad_norm": 0.018878811970353127, "kl": 0.012603759765625, "learning_rate": 3.5390325045304704e-08, "loss": -0.005, "num_tokens": 350595932.0, "reward": 0.44260725378990173, "reward_std": 0.06230099871754646, "rewards/gemini_judge_reward_func/mean": 0.1127232164144516, "rewards/gemini_judge_reward_func/std": 0.24113184213638306, "rewards/semantic_correctness_reward_func/mean": 0.41458967328071594, "rewards/semantic_correctness_reward_func/std": 0.1906474530696869, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 147.80804443359375, "completions/mean_terminated_length": 143.87893676757812, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.3363066023643891, "grad_norm": 0.020969383418560028, "kl": 0.016061782836914062, "learning_rate": 3.345366823180929e-08, "loss": 0.0002, "num_tokens": 350964901.0, "reward": 0.4793672561645508, "reward_std": 0.06915397942066193, "rewards/gemini_judge_reward_func/mean": 0.1529017835855484, "rewards/gemini_judge_reward_func/std": 0.276716947555542, "rewards/semantic_correctness_reward_func/mean": 0.4465325176715851, "rewards/semantic_correctness_reward_func/std": 0.23200318217277527, "rewards/xmlcount_reward_func/mean": 0.8222500681877136, "rewards/xmlcount_reward_func/std": 0.3842346966266632, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 153.09375, "completions/mean_terminated_length": 145.2477569580078, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.336648030387094, "grad_norm": 0.019626779481768608, "kl": 0.012929677963256836, "learning_rate": 3.1571324773286284e-08, "loss": 0.0086, "num_tokens": 351329478.0, "reward": 0.47190842032432556, "reward_std": 0.09431184083223343, "rewards/gemini_judge_reward_func/mean": 0.1908482164144516, "rewards/gemini_judge_reward_func/std": 0.31251001358032227, "rewards/semantic_correctness_reward_func/mean": 0.507622241973877, "rewards/semantic_correctness_reward_func/std": 0.20726893842220306, "rewards/xmlcount_reward_func/mean": 0.735111653804779, "rewards/xmlcount_reward_func/std": 0.441826730966568, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 164.71429443359375, "completions/mean_terminated_length": 145.09588623046875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.336989458409799, "grad_norm": 0.020227260887622833, "kl": 0.017192602157592773, "learning_rate": 2.9743315254743834e-08, "loss": 0.0006, "num_tokens": 351678310.0, "reward": 0.44850510358810425, "reward_std": 0.07030683010816574, "rewards/gemini_judge_reward_func/mean": 0.1573660671710968, "rewards/gemini_judge_reward_func/std": 0.2792554795742035, "rewards/semantic_correctness_reward_func/mean": 0.46204322576522827, "rewards/semantic_correctness_reward_func/std": 0.20317070186138153, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 164.22769165039062, "completions/mean_terminated_length": 140.564208984375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.337330886432504, "grad_norm": 0.019894586876034737, "kl": 0.016435623168945312, "learning_rate": 2.7969659666999273e-08, "loss": -0.0137, "num_tokens": 352053657.0, "reward": 0.3776443302631378, "reward_std": 0.06739164888858795, "rewards/gemini_judge_reward_func/mean": 0.1417410671710968, "rewards/gemini_judge_reward_func/std": 0.2796315848827362, "rewards/semantic_correctness_reward_func/mean": 0.4249892830848694, "rewards/semantic_correctness_reward_func/std": 0.2107486128807068, "rewards/xmlcount_reward_func/mean": 0.5898750424385071, "rewards/xmlcount_reward_func/std": 0.493558406829834, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 155.8482208251953, "completions/mean_terminated_length": 140.06362915039062, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.3376723144552089, "grad_norm": 0.02049451507627964, "kl": 0.016164541244506836, "learning_rate": 2.625037740646763e-08, "loss": -0.0158, "num_tokens": 352410271.0, "reward": 0.4529190957546234, "reward_std": 0.07489325851202011, "rewards/gemini_judge_reward_func/mean": 0.1607142835855484, "rewards/gemini_judge_reward_func/std": 0.28872138261795044, "rewards/semantic_correctness_reward_func/mean": 0.44166675209999084, "rewards/semantic_correctness_reward_func/std": 0.2282264530658722, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 1024.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 167.32589721679688, "completions/mean_terminated_length": 147.76712036132812, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.3380137424779139, "grad_norm": 0.02016194351017475, "kl": 0.012232065200805664, "learning_rate": 2.4585487274942922e-08, "loss": -0.0117, "num_tokens": 352787360.0, "reward": 0.4089185893535614, "reward_std": 0.056611210107803345, "rewards/gemini_judge_reward_func/mean": 0.1037946417927742, "rewards/gemini_judge_reward_func/std": 0.23580104112625122, "rewards/semantic_correctness_reward_func/mean": 0.4427536427974701, "rewards/semantic_correctness_reward_func/std": 0.2222413569688797, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903594970703, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 157.52679443359375, "completions/mean_terminated_length": 149.72071838378906, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.33835517050061886, "grad_norm": 0.019806455820798874, "kl": 0.014673709869384766, "learning_rate": 2.2975007479397736e-08, "loss": -0.0125, "num_tokens": 353139082.0, "reward": 0.415515273809433, "reward_std": 0.05204417183995247, "rewards/gemini_judge_reward_func/mean": 0.1071428582072258, "rewards/gemini_judge_reward_func/std": 0.1915077120065689, "rewards/semantic_correctness_reward_func/mean": 0.39754053950309753, "rewards/semantic_correctness_reward_func/std": 0.1915198266506195, "rewards/xmlcount_reward_func/mean": 0.7328749895095825, "rewards/xmlcount_reward_func/std": 0.44427838921546936, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 160.17857360839844, "completions/mean_terminated_length": 136.40367126464844, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.3386965985233238, "grad_norm": 0.021138962358236313, "kl": 0.014227151870727539, "learning_rate": 2.1418955631781203e-08, "loss": -0.0001, "num_tokens": 353492886.0, "reward": 0.406089186668396, "reward_std": 0.05585183575749397, "rewards/gemini_judge_reward_func/mean": 0.15625, "rewards/gemini_judge_reward_func/std": 0.28237661719322205, "rewards/semantic_correctness_reward_func/mean": 0.4666958451271057, "rewards/semantic_correctness_reward_func/std": 0.20080603659152985, "rewards/xmlcount_reward_func/mean": 0.6256250739097595, "rewards/xmlcount_reward_func/std": 0.48569241166114807, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 171.7678680419922, "completions/mean_terminated_length": 148.31192016601562, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.33903802654602877, "grad_norm": 0.019860416650772095, "kl": 0.014904022216796875, "learning_rate": 1.9917348748826337e-08, "loss": 0.0035, "num_tokens": 353868734.0, "reward": 0.41911885142326355, "reward_std": 0.06615443527698517, "rewards/gemini_judge_reward_func/mean": 0.1294642835855484, "rewards/gemini_judge_reward_func/std": 0.2603941261768341, "rewards/semantic_correctness_reward_func/mean": 0.4424155652523041, "rewards/semantic_correctness_reward_func/std": 0.21090011298656464, "rewards/xmlcount_reward_func/mean": 0.6971250176429749, "rewards/xmlcount_reward_func/std": 0.4612903892993927, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 158.16964721679688, "completions/mean_terminated_length": 142.42726135253906, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.33937945456873375, "grad_norm": 0.02125832624733448, "kl": 0.01460719108581543, "learning_rate": 1.847020325186577e-08, "loss": -0.018, "num_tokens": 354216172.0, "reward": 0.47000354528427124, "reward_std": 0.07803654670715332, "rewards/gemini_judge_reward_func/mean": 0.1607142835855484, "rewards/gemini_judge_reward_func/std": 0.2633373737335205, "rewards/semantic_correctness_reward_func/mean": 0.45558905601501465, "rewards/semantic_correctness_reward_func/std": 0.20520326495170593, "rewards/xmlcount_reward_func/mean": 0.7865000367164612, "rewards/xmlcount_reward_func/std": 0.41165614128112793, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 1024.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 162.82589721679688, "completions/mean_terminated_length": 147.16818237304688, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.3397208825914387, "grad_norm": 0.02087085321545601, "kl": 0.013870716094970703, "learning_rate": 1.7077534966650767e-08, "loss": 0.0017, "num_tokens": 354569773.0, "reward": 0.4318699240684509, "reward_std": 0.055673111230134964, "rewards/gemini_judge_reward_func/mean": 0.1183035746216774, "rewards/gemini_judge_reward_func/std": 0.24595339596271515, "rewards/semantic_correctness_reward_func/mean": 0.4418494701385498, "rewards/semantic_correctness_reward_func/std": 0.21151991188526154, "rewards/xmlcount_reward_func/mean": 0.7404464483261108, "rewards/xmlcount_reward_func/std": 0.43912947177886963, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 149.60714721679688, "completions/mean_terminated_length": 145.6861114501953, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.34006231061414366, "grad_norm": 0.022532809525728226, "kl": 0.01392984390258789, "learning_rate": 1.5739359123178587e-08, "loss": -0.003, "num_tokens": 354903629.0, "reward": 0.47908294200897217, "reward_std": 0.06745254993438721, "rewards/gemini_judge_reward_func/mean": 0.1674107164144516, "rewards/gemini_judge_reward_func/std": 0.30849042534828186, "rewards/semantic_correctness_reward_func/mean": 0.4438968300819397, "rewards/semantic_correctness_reward_func/std": 0.23170118033885956, "rewards/xmlcount_reward_func/mean": 0.8083482384681702, "rewards/xmlcount_reward_func/std": 0.39272549748420715, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 166.58929443359375, "completions/mean_terminated_length": 142.99081420898438, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.34040373863684864, "grad_norm": 0.02883129194378853, "kl": 0.013774394989013672, "learning_rate": 1.4455690355525964e-08, "loss": -0.0131, "num_tokens": 355264021.0, "reward": 0.40281856060028076, "reward_std": 0.05309184268116951, "rewards/gemini_judge_reward_func/mean": 0.1183035746216774, "rewards/gemini_judge_reward_func/std": 0.23187628388404846, "rewards/semantic_correctness_reward_func/mean": 0.41898536682128906, "rewards/semantic_correctness_reward_func/std": 0.20515595376491547, "rewards/xmlcount_reward_func/mean": 0.6792500615119934, "rewards/xmlcount_reward_func/std": 0.46853893995285034, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 1024.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 152.8928680419922, "completions/mean_terminated_length": 145.0450439453125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.34074516665955357, "grad_norm": 0.020875928923487663, "kl": 0.014181137084960938, "learning_rate": 1.3226542701689215e-08, "loss": -0.0043, "num_tokens": 355602493.0, "reward": 0.45096153020858765, "reward_std": 0.0548894889652729, "rewards/gemini_judge_reward_func/mean": 0.1506696492433548, "rewards/gemini_judge_reward_func/std": 0.27895063161849976, "rewards/semantic_correctness_reward_func/mean": 0.451968252658844, "rewards/semantic_correctness_reward_func/std": 0.22784963250160217, "rewards/xmlcount_reward_func/mean": 0.7507500052452087, "rewards/xmlcount_reward_func/std": 0.4344164729118347, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 1024.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 145.7991180419922, "completions/mean_terminated_length": 141.86099243164062, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.34108659468225855, "grad_norm": 0.02083824761211872, "kl": 0.015220165252685547, "learning_rate": 1.2051929603428824e-08, "loss": 0.0042, "num_tokens": 355963108.0, "reward": 0.4503902792930603, "reward_std": 0.06350675225257874, "rewards/gemini_judge_reward_func/mean": 0.1584821492433548, "rewards/gemini_judge_reward_func/std": 0.27303215861320496, "rewards/semantic_correctness_reward_func/mean": 0.4737100899219513, "rewards/semantic_correctness_reward_func/std": 0.23610134422779083, "rewards/xmlcount_reward_func/mean": 0.7306384444236755, "rewards/xmlcount_reward_func/std": 0.4441836178302765, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 1024.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 161.3794708251953, "completions/mean_terminated_length": 137.63760375976562, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.34142802270496353, "grad_norm": 0.021282846108078957, "kl": 0.015776872634887695, "learning_rate": 1.0931863906127327e-08, "loss": 0.0138, "num_tokens": 356339045.0, "reward": 0.41461434960365295, "reward_std": 0.05614163354039192, "rewards/gemini_judge_reward_func/mean": 0.1138392835855484, "rewards/gemini_judge_reward_func/std": 0.24806062877178192, "rewards/semantic_correctness_reward_func/mean": 0.4153929352760315, "rewards/semantic_correctness_reward_func/std": 0.21967321634292603, "rewards/xmlcount_reward_func/mean": 0.7150000333786011, "rewards/xmlcount_reward_func/std": 0.4532184898853302, "step": 1000 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 356339045, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }