{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.34142802270496353,
  "eval_steps": 500,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 270.0,
      "completions/mean_length": 141.27679443359375,
      "completions/mean_terminated_length": 133.32432556152344,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.0003414280227049635,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 0.0,
      "loss": -0.0458,
      "num_tokens": 379814.0,
      "reward": 0.40348002314567566,
      "reward_std": 0.06271512806415558,
      "rewards/gemini_judge_reward_func/mean": 0.109375,
      "rewards/gemini_judge_reward_func/std": 0.21642543375492096,
      "rewards/semantic_correctness_reward_func/mean": 0.42229294776916504,
      "rewards/semantic_correctness_reward_func/std": 0.2194633036851883,
      "rewards/xmlcount_reward_func/mean": 0.6881785988807678,
      "rewards/xmlcount_reward_func/std": 0.46016210317611694,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 287.0,
      "completions/max_terminated_length": 287.0,
      "completions/mean_length": 137.9107208251953,
      "completions/mean_terminated_length": 137.9107208251953,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.000682856045409927,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 0.0,
      "loss": -0.0198,
      "num_tokens": 720774.0,
      "reward": 0.4278814196586609,
      "reward_std": 0.056292574852705,
      "rewards/gemini_judge_reward_func/mean": 0.1160714253783226,
      "rewards/gemini_judge_reward_func/std": 0.22691282629966736,
      "rewards/semantic_correctness_reward_func/mean": 0.39234450459480286,
      "rewards/semantic_correctness_reward_func/std": 0.2054908126592636,
      "rewards/xmlcount_reward_func/mean": 0.7574599385261536,
      "rewards/xmlcount_reward_func/std": 0.4265315532684326,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 338.0,
      "completions/max_terminated_length": 338.0,
      "completions/mean_length": 126.82589721679688,
      "completions/mean_terminated_length": 126.82589721679688,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.0010242840681148906,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 0.0,
      "loss": -0.0051,
      "num_tokens": 1091335.0,
      "reward": 0.41104522347450256,
      "reward_std": 0.05016090348362923,
      "rewards/gemini_judge_reward_func/mean": 0.1395089328289032,
      "rewards/gemini_judge_reward_func/std": 0.25786155462265015,
      "rewards/semantic_correctness_reward_func/mean": 0.4177080988883972,
      "rewards/semantic_correctness_reward_func/std": 0.2196023315191269,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 156.24554443359375,
      "completions/mean_terminated_length": 148.4279327392578,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.001365712090819854,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 0.0,
      "loss": -0.0358,
      "num_tokens": 1433778.0,
      "reward": 0.4309219717979431,
      "reward_std": 0.07280989736318588,
      "rewards/gemini_judge_reward_func/mean": 0.1127232164144516,
      "rewards/gemini_judge_reward_func/std": 0.22798825800418854,
      "rewards/semantic_correctness_reward_func/mean": 0.4097882807254791,
      "rewards/semantic_correctness_reward_func/std": 0.2280801683664322,
      "rewards/xmlcount_reward_func/mean": 0.7596875429153442,
      "rewards/xmlcount_reward_func/std": 0.4291202127933502,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 597.0,
      "completions/max_terminated_length": 597.0,
      "completions/mean_length": 151.125,
      "completions/mean_terminated_length": 151.125,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.0017071401135248176,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 0.0,
      "loss": -0.0083,
      "num_tokens": 1800166.0,
      "reward": 0.44322600960731506,
      "reward_std": 0.060560885816812515,
      "rewards/gemini_judge_reward_func/mean": 0.1316964328289032,
      "rewards/gemini_judge_reward_func/std": 0.25926730036735535,
      "rewards/semantic_correctness_reward_func/mean": 0.4244246482849121,
      "rewards/semantic_correctness_reward_func/std": 0.21751059591770172,
      "rewards/xmlcount_reward_func/mean": 0.7641563415527344,
      "rewards/xmlcount_reward_func/std": 0.4263768792152405,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 380.0,
      "completions/mean_length": 157.68304443359375,
      "completions/mean_terminated_length": 145.92308044433594,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.002048568136229781,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 0.0,
      "loss": -0.0095,
      "num_tokens": 2137347.0,
      "reward": 0.4433988034725189,
      "reward_std": 0.07498325407505035,
      "rewards/gemini_judge_reward_func/mean": 0.1450892835855484,
      "rewards/gemini_judge_reward_func/std": 0.27536848187446594,
      "rewards/semantic_correctness_reward_func/mean": 0.42084214091300964,
      "rewards/semantic_correctness_reward_func/std": 0.23131638765335083,
      "rewards/xmlcount_reward_func/mean": 0.7529866099357605,
      "rewards/xmlcount_reward_func/std": 0.4318158030509949,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 131.30804443359375,
      "completions/mean_terminated_length": 127.30493927001953,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.0023899961589347444,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 0.0,
      "loss": -0.0101,
      "num_tokens": 2528988.0,
      "reward": 0.3721596896648407,
      "reward_std": 0.06552143394947052,
      "rewards/gemini_judge_reward_func/mean": 0.1439732164144516,
      "rewards/gemini_judge_reward_func/std": 0.25538474321365356,
      "rewards/semantic_correctness_reward_func/mean": 0.4288518726825714,
      "rewards/semantic_correctness_reward_func/std": 0.22205649316310883,
      "rewards/xmlcount_reward_func/mean": 0.5720000267028809,
      "rewards/xmlcount_reward_func/std": 0.4964759945869446,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 624.0,
      "completions/max_terminated_length": 624.0,
      "completions/mean_length": 132.20089721679688,
      "completions/mean_terminated_length": 132.20089721679688,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.002731424181639708,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 0.0,
      "loss": -0.0009,
      "num_tokens": 2903613.0,
      "reward": 0.4032416045665741,
      "reward_std": 0.06853938102722168,
      "rewards/gemini_judge_reward_func/mean": 0.1294642835855484,
      "rewards/gemini_judge_reward_func/std": 0.24140411615371704,
      "rewards/semantic_correctness_reward_func/mean": 0.43676143884658813,
      "rewards/semantic_correctness_reward_func/std": 0.22384540736675262,
      "rewards/xmlcount_reward_func/mean": 0.660258948802948,
      "rewards/xmlcount_reward_func/std": 0.47449371218681335,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 294.0,
      "completions/mean_length": 141.95982360839844,
      "completions/mean_terminated_length": 138.00448608398438,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.0030728522043446714,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 0.0,
      "loss": -0.0041,
      "num_tokens": 3262084.0,
      "reward": 0.4193665385246277,
      "reward_std": 0.06024722009897232,
      "rewards/gemini_judge_reward_func/mean": 0.0904017835855484,
      "rewards/gemini_judge_reward_func/std": 0.2089034467935562,
      "rewards/semantic_correctness_reward_func/mean": 0.4145289957523346,
      "rewards/semantic_correctness_reward_func/std": 0.1940658688545227,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 757.0,
      "completions/mean_length": 151.4866180419922,
      "completions/mean_terminated_length": 135.6227264404297,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.003414280227049635,
      "grad_norm": 0.0,
      "kl": 0.0,
      "learning_rate": 0.0,
      "loss": 0.0118,
      "num_tokens": 3636097.0,
      "reward": 0.3932708203792572,
      "reward_std": 0.0754866749048233,
      "rewards/gemini_judge_reward_func/mean": 0.15625,
      "rewards/gemini_judge_reward_func/std": 0.2853386700153351,
      "rewards/semantic_correctness_reward_func/mean": 0.43835392594337463,
      "rewards/semantic_correctness_reward_func/std": 0.2281491756439209,
      "rewards/xmlcount_reward_func/mean": 0.6077500581741333,
      "rewards/xmlcount_reward_func/std": 0.48996880650520325,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 294.0,
      "completions/max_terminated_length": 294.0,
      "completions/mean_length": 130.58482360839844,
      "completions/mean_terminated_length": 130.58482360839844,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.0037557082497545985,
      "grad_norm": 0.039375144988298416,
      "kl": 0.0,
      "learning_rate": 2.0000000000000002e-07,
      "loss": -0.0231,
      "num_tokens": 3973920.0,
      "reward": 0.4216010272502899,
      "reward_std": 0.07699327915906906,
      "rewards/gemini_judge_reward_func/mean": 0.1171875,
      "rewards/gemini_judge_reward_func/std": 0.26352497935295105,
      "rewards/semantic_correctness_reward_func/mean": 0.4078800678253174,
      "rewards/semantic_correctness_reward_func/std": 0.21701829135417938,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 931.0,
      "completions/mean_length": 163.13839721679688,
      "completions/mean_terminated_length": 143.4840087890625,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.004097136272459562,
      "grad_norm": 0.027680950239300728,
      "kl": 0.0,
      "learning_rate": 4.0000000000000003e-07,
      "loss": -0.0192,
      "num_tokens": 4325419.0,
      "reward": 0.38460925221443176,
      "reward_std": 0.05896308273077011,
      "rewards/gemini_judge_reward_func/mean": 0.0959821417927742,
      "rewards/gemini_judge_reward_func/std": 0.2214544713497162,
      "rewards/semantic_correctness_reward_func/mean": 0.37258180975914,
      "rewards/semantic_correctness_reward_func/std": 0.18282517790794373,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853896975517273,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 402.0,
      "completions/max_terminated_length": 402.0,
      "completions/mean_length": 151.97769165039062,
      "completions/mean_terminated_length": 151.97769165039062,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.0044385642951645255,
      "grad_norm": 0.02707161009311676,
      "kl": 1.8555670976638794e-05,
      "learning_rate": 6.000000000000001e-07,
      "loss": -0.0014,
      "num_tokens": 4651218.0,
      "reward": 0.4828442335128784,
      "reward_std": 0.07185830920934677,
      "rewards/gemini_judge_reward_func/mean": 0.1149553582072258,
      "rewards/gemini_judge_reward_func/std": 0.22932343184947968,
      "rewards/semantic_correctness_reward_func/mean": 0.43256044387817383,
      "rewards/semantic_correctness_reward_func/std": 0.20364603400230408,
      "rewards/xmlcount_reward_func/mean": 0.8758750557899475,
      "rewards/xmlcount_reward_func/std": 0.33179107308387756,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 860.0,
      "completions/mean_length": 152.73214721679688,
      "completions/mean_terminated_length": 144.8828887939453,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.004779992317869489,
      "grad_norm": 0.0345768965780735,
      "kl": 1.5214085578918457e-05,
      "learning_rate": 8.000000000000001e-07,
      "loss": -0.015,
      "num_tokens": 4993970.0,
      "reward": 0.42273515462875366,
      "reward_std": 0.06252222508192062,
      "rewards/gemini_judge_reward_func/mean": 0.09375,
      "rewards/gemini_judge_reward_func/std": 0.1822412759065628,
      "rewards/semantic_correctness_reward_func/mean": 0.4202113747596741,
      "rewards/semantic_correctness_reward_func/std": 0.2077675461769104,
      "rewards/xmlcount_reward_func/mean": 0.7529821395874023,
      "rewards/xmlcount_reward_func/std": 0.43116891384124756,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 446.0,
      "completions/mean_length": 134.96429443359375,
      "completions/mean_terminated_length": 130.9775848388672,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.005121420340574453,
      "grad_norm": 0.027755815535783768,
      "kl": 1.3284385204315186e-05,
      "learning_rate": 1.0000000000000002e-06,
      "loss": -0.0036,
      "num_tokens": 5346350.0,
      "reward": 0.4308743476867676,
      "reward_std": 0.06485090404748917,
      "rewards/gemini_judge_reward_func/mean": 0.1261160671710968,
      "rewards/gemini_judge_reward_func/std": 0.22584888339042664,
      "rewards/semantic_correctness_reward_func/mean": 0.4319072663784027,
      "rewards/semantic_correctness_reward_func/std": 0.19891038537025452,
      "rewards/xmlcount_reward_func/mean": 0.7351161241531372,
      "rewards/xmlcount_reward_func/std": 0.44118446111679077,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 150.21429443359375,
      "completions/mean_terminated_length": 146.2959747314453,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.005462848363279416,
      "grad_norm": 0.027755815535783768,
      "kl": 1.9583851099014282e-05,
      "learning_rate": 1.0000000000000002e-06,
      "loss": -0.0088,
      "num_tokens": 5673342.0,
      "reward": 0.4351733922958374,
      "reward_std": 0.06109142303466797,
      "rewards/gemini_judge_reward_func/mean": 0.1183035746216774,
      "rewards/gemini_judge_reward_func/std": 0.21817582845687866,
      "rewards/semantic_correctness_reward_func/mean": 0.437759667634964,
      "rewards/semantic_correctness_reward_func/std": 0.19619369506835938,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 384.0,
      "completions/mean_length": 144.22769165039062,
      "completions/mean_terminated_length": 136.3018035888672,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.00580427638598438,
      "grad_norm": 0.035458628088235855,
      "kl": 1.4953315258026123e-05,
      "learning_rate": 1.2000000000000002e-06,
      "loss": -0.0095,
      "num_tokens": 6053241.0,
      "reward": 0.4043574631214142,
      "reward_std": 0.07264947146177292,
      "rewards/gemini_judge_reward_func/mean": 0.1328125,
      "rewards/gemini_judge_reward_func/std": 0.24477799236774445,
      "rewards/semantic_correctness_reward_func/mean": 0.41257286071777344,
      "rewards/semantic_correctness_reward_func/std": 0.22099269926548004,
      "rewards/xmlcount_reward_func/mean": 0.671794593334198,
      "rewards/xmlcount_reward_func/std": 0.46925294399261475,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 605.0,
      "completions/mean_length": 145.47769165039062,
      "completions/mean_terminated_length": 141.53811645507812,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.006145704408689343,
      "grad_norm": 0.03304464370012283,
      "kl": 1.3154000043869019e-05,
      "learning_rate": 1.4000000000000001e-06,
      "loss": -0.0373,
      "num_tokens": 6402224.0,
      "reward": 0.4680355191230774,
      "reward_std": 0.06361004710197449,
      "rewards/gemini_judge_reward_func/mean": 0.1517857164144516,
      "rewards/gemini_judge_reward_func/std": 0.29065632820129395,
      "rewards/semantic_correctness_reward_func/mean": 0.4278559684753418,
      "rewards/semantic_correctness_reward_func/std": 0.2081516683101654,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 274.0,
      "completions/mean_length": 147.27679443359375,
      "completions/mean_terminated_length": 143.3452911376953,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.006487132431394307,
      "grad_norm": 0.031187007203698158,
      "kl": 1.9073486328125e-05,
      "learning_rate": 1.6000000000000001e-06,
      "loss": -0.0048,
      "num_tokens": 6735918.0,
      "reward": 0.4452937841415405,
      "reward_std": 0.07176318019628525,
      "rewards/gemini_judge_reward_func/mean": 0.1116071417927742,
      "rewards/gemini_judge_reward_func/std": 0.22669215500354767,
      "rewards/semantic_correctness_reward_func/mean": 0.38559380173683167,
      "rewards/semantic_correctness_reward_func/std": 0.2231680005788803,
      "rewards/xmlcount_reward_func/mean": 0.8088303804397583,
      "rewards/xmlcount_reward_func/std": 0.3893822133541107,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 157.04019165039062,
      "completions/mean_terminated_length": 141.2772674560547,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.00682856045409927,
      "grad_norm": 0.0283343568444252,
      "kl": 1.8853694200515747e-05,
      "learning_rate": 1.8000000000000001e-06,
      "loss": -0.0294,
      "num_tokens": 7075515.0,
      "reward": 0.413688063621521,
      "reward_std": 0.05096305534243584,
      "rewards/gemini_judge_reward_func/mean": 0.0814732164144516,
      "rewards/gemini_judge_reward_func/std": 0.19466669857501984,
      "rewards/semantic_correctness_reward_func/mean": 0.4039938151836395,
      "rewards/semantic_correctness_reward_func/std": 0.18301716446876526,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 729.0,
      "completions/mean_length": 141.3303680419922,
      "completions/mean_terminated_length": 133.37838745117188,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.007169988476804234,
      "grad_norm": 0.030031763017177582,
      "kl": 3.3717602491378784e-05,
      "learning_rate": 2.0000000000000003e-06,
      "loss": -0.0204,
      "num_tokens": 7425837.0,
      "reward": 0.38512933254241943,
      "reward_std": 0.07402481883764267,
      "rewards/gemini_judge_reward_func/mean": 0.1216517835855484,
      "rewards/gemini_judge_reward_func/std": 0.2437330186367035,
      "rewards/semantic_correctness_reward_func/mean": 0.36952146887779236,
      "rewards/semantic_correctness_reward_func/std": 0.2271348237991333,
      "rewards/xmlcount_reward_func/mean": 0.6564107537269592,
      "rewards/xmlcount_reward_func/std": 0.5066681504249573,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 278.0,
      "completions/mean_length": 142.75894165039062,
      "completions/mean_terminated_length": 138.8071746826172,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.007511416499509197,
      "grad_norm": 0.028901347890496254,
      "kl": 2.1755695343017578e-05,
      "learning_rate": 2.2e-06,
      "loss": 0.0354,
      "num_tokens": 7767839.0,
      "reward": 0.42108339071273804,
      "reward_std": 0.06260724365711212,
      "rewards/gemini_judge_reward_func/mean": 0.1082589253783226,
      "rewards/gemini_judge_reward_func/std": 0.2214629352092743,
      "rewards/semantic_correctness_reward_func/mean": 0.41428306698799133,
      "rewards/semantic_correctness_reward_func/std": 0.20763908326625824,
      "rewards/xmlcount_reward_func/mean": 0.7373080849647522,
      "rewards/xmlcount_reward_func/std": 0.43889865279197693,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 316.0,
      "completions/mean_length": 150.8303680419922,
      "completions/mean_terminated_length": 142.96397399902344,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.00785284452221416,
      "grad_norm": 0.026742972433567047,
      "kl": 2.8714537620544434e-05,
      "learning_rate": 2.4000000000000003e-06,
      "loss": -0.0139,
      "num_tokens": 8112905.0,
      "reward": 0.43243101239204407,
      "reward_std": 0.05913807824254036,
      "rewards/gemini_judge_reward_func/mean": 0.0870535746216774,
      "rewards/gemini_judge_reward_func/std": 0.17304261028766632,
      "rewards/semantic_correctness_reward_func/mean": 0.42398518323898315,
      "rewards/semantic_correctness_reward_func/std": 0.1769956350326538,
      "rewards/xmlcount_reward_func/mean": 0.7820313572883606,
      "rewards/xmlcount_reward_func/std": 0.41473883390426636,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 836.0,
      "completions/mean_length": 154.7232208251953,
      "completions/mean_terminated_length": 146.8918914794922,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.008194272544919124,
      "grad_norm": 0.03460094705224037,
      "kl": 7.3261559009552e-05,
      "learning_rate": 2.6e-06,
      "loss": -0.006,
      "num_tokens": 8467131.0,
      "reward": 0.3985432982444763,
      "reward_std": 0.05673561245203018,
      "rewards/gemini_judge_reward_func/mean": 0.1015625,
      "rewards/gemini_judge_reward_func/std": 0.21178245544433594,
      "rewards/semantic_correctness_reward_func/mean": 0.39980557560920715,
      "rewards/semantic_correctness_reward_func/std": 0.2231663316488266,
      "rewards/xmlcount_reward_func/mean": 0.6948928833007812,
      "rewards/xmlcount_reward_func/std": 0.4610230326652527,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 943.0,
      "completions/mean_length": 146.30357360839844,
      "completions/mean_terminated_length": 138.39639282226562,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.008535700567624089,
      "grad_norm": 0.03418440371751785,
      "kl": 0.00016423314809799194,
      "learning_rate": 2.8000000000000003e-06,
      "loss": -0.0237,
      "num_tokens": 8838515.0,
      "reward": 0.39606812596321106,
      "reward_std": 0.07865350693464279,
      "rewards/gemini_judge_reward_func/mean": 0.1551339328289032,
      "rewards/gemini_judge_reward_func/std": 0.2912905812263489,
      "rewards/semantic_correctness_reward_func/mean": 0.39453673362731934,
      "rewards/semantic_correctness_reward_func/std": 0.22986909747123718,
      "rewards/xmlcount_reward_func/mean": 0.6377679109573364,
      "rewards/xmlcount_reward_func/std": 0.48144102096557617,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 425.0,
      "completions/max_terminated_length": 425.0,
      "completions/mean_length": 155.25,
      "completions/mean_terminated_length": 155.25,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.008877128590329051,
      "grad_norm": 0.025704992935061455,
      "kl": 0.00011079013347625732,
      "learning_rate": 3e-06,
      "loss": -0.011,
      "num_tokens": 9173275.0,
      "reward": 0.45401424169540405,
      "reward_std": 0.06545478105545044,
      "rewards/gemini_judge_reward_func/mean": 0.1071428582072258,
      "rewards/gemini_judge_reward_func/std": 0.2276175171136856,
      "rewards/semantic_correctness_reward_func/mean": 0.4121246933937073,
      "rewards/semantic_correctness_reward_func/std": 0.19307947158813477,
      "rewards/xmlcount_reward_func/mean": 0.8218303918838501,
      "rewards/xmlcount_reward_func/std": 0.3807325065135956,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 333.0,
      "completions/mean_length": 157.77679443359375,
      "completions/mean_terminated_length": 142.0272674560547,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.009218556613034015,
      "grad_norm": 0.03233994543552399,
      "kl": 0.00011872127652168274,
      "learning_rate": 3.2000000000000003e-06,
      "loss": -0.0147,
      "num_tokens": 9514313.0,
      "reward": 0.41901880502700806,
      "reward_std": 0.05425465106964111,
      "rewards/gemini_judge_reward_func/mean": 0.1004464253783226,
      "rewards/gemini_judge_reward_func/std": 0.2269900143146515,
      "rewards/semantic_correctness_reward_func/mean": 0.3894062638282776,
      "rewards/semantic_correctness_reward_func/std": 0.21159562468528748,
      "rewards/xmlcount_reward_func/mean": 0.7523974180221558,
      "rewards/xmlcount_reward_func/std": 0.43225109577178955,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 682.0,
      "completions/mean_length": 143.82589721679688,
      "completions/mean_terminated_length": 139.87893676757812,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.009559984635738978,
      "grad_norm": 0.030443880707025528,
      "kl": 0.00025102123618125916,
      "learning_rate": 3.4000000000000005e-06,
      "loss": 0.0046,
      "num_tokens": 9857054.0,
      "reward": 0.4207466244697571,
      "reward_std": 0.04802559316158295,
      "rewards/gemini_judge_reward_func/mean": 0.0792410746216774,
      "rewards/gemini_judge_reward_func/std": 0.16614827513694763,
      "rewards/semantic_correctness_reward_func/mean": 0.37225088477134705,
      "rewards/semantic_correctness_reward_func/std": 0.17117980122566223,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 751.0,
      "completions/max_terminated_length": 751.0,
      "completions/mean_length": 128.8794708251953,
      "completions/mean_terminated_length": 128.8794708251953,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.009901412658443942,
      "grad_norm": 0.039184801280498505,
      "kl": 0.0005348548293113708,
      "learning_rate": 3.6000000000000003e-06,
      "loss": -0.0665,
      "num_tokens": 10256135.0,
      "reward": 0.3793807625770569,
      "reward_std": 0.05964759737253189,
      "rewards/gemini_judge_reward_func/mean": 0.1339285671710968,
      "rewards/gemini_judge_reward_func/std": 0.2697960138320923,
      "rewards/semantic_correctness_reward_func/mean": 0.41354653239250183,
      "rewards/semantic_correctness_reward_func/std": 0.2386574149131775,
      "rewards/xmlcount_reward_func/mean": 0.6077500581741333,
      "rewards/xmlcount_reward_func/std": 0.48996880650520325,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 273.0,
      "completions/mean_length": 147.1607208251953,
      "completions/mean_terminated_length": 143.2287139892578,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.010242840681148906,
      "grad_norm": 0.026800105348229408,
      "kl": 0.00027988851070404053,
      "learning_rate": 3.8000000000000005e-06,
      "loss": -0.0072,
      "num_tokens": 10580663.0,
      "reward": 0.45957663655281067,
      "reward_std": 0.06587394326925278,
      "rewards/gemini_judge_reward_func/mean": 0.1026785746216774,
      "rewards/gemini_judge_reward_func/std": 0.20789778232574463,
      "rewards/semantic_correctness_reward_func/mean": 0.41227564215660095,
      "rewards/semantic_correctness_reward_func/std": 0.19964328408241272,
      "rewards/xmlcount_reward_func/mean": 0.8401250839233398,
      "rewards/xmlcount_reward_func/std": 0.3684578835964203,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 137.16519165039062,
      "completions/mean_terminated_length": 133.1883544921875,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.010584268703853868,
      "grad_norm": 0.028585907071828842,
      "kl": 0.0008361563086509705,
      "learning_rate": 4.000000000000001e-06,
      "loss": -0.0323,
      "num_tokens": 10931200.0,
      "reward": 0.40794217586517334,
      "reward_std": 0.060390252619981766,
      "rewards/gemini_judge_reward_func/mean": 0.1194196417927742,
      "rewards/gemini_judge_reward_func/std": 0.2207612693309784,
      "rewards/semantic_correctness_reward_func/mean": 0.44237130880355835,
      "rewards/semantic_correctness_reward_func/std": 0.18124501407146454,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 613.0,
      "completions/mean_length": 139.8303680419922,
      "completions/mean_terminated_length": 135.865478515625,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.010925696726558833,
      "grad_norm": 0.029551656916737556,
      "kl": 0.0011077597737312317,
      "learning_rate": 4.2000000000000004e-06,
      "loss": -0.0086,
      "num_tokens": 11283158.0,
      "reward": 0.3982951045036316,
      "reward_std": 0.051123134791851044,
      "rewards/gemini_judge_reward_func/mean": 0.1116071417927742,
      "rewards/gemini_judge_reward_func/std": 0.2375587821006775,
      "rewards/semantic_correctness_reward_func/mean": 0.4276362359523773,
      "rewards/semantic_correctness_reward_func/std": 0.23077288269996643,
      "rewards/xmlcount_reward_func/mean": 0.6703125238418579,
      "rewards/xmlcount_reward_func/std": 0.4718664884567261,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 302.0,
      "completions/max_terminated_length": 302.0,
      "completions/mean_length": 132.13839721679688,
      "completions/mean_terminated_length": 132.13839721679688,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.011267124749263795,
      "grad_norm": 0.02802436798810959,
      "kl": 0.001424439251422882,
      "learning_rate": 4.4e-06,
      "loss": -0.0199,
      "num_tokens": 11632649.0,
      "reward": 0.4111942648887634,
      "reward_std": 0.05410204827785492,
      "rewards/gemini_judge_reward_func/mean": 0.0959821417927742,
      "rewards/gemini_judge_reward_func/std": 0.20018360018730164,
      "rewards/semantic_correctness_reward_func/mean": 0.3969176113605499,
      "rewards/semantic_correctness_reward_func/std": 0.1971798837184906,
      "rewards/xmlcount_reward_func/mean": 0.733544647693634,
      "rewards/xmlcount_reward_func/std": 0.44044601917266846,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 445.0,
      "completions/mean_length": 149.41519165039062,
      "completions/mean_terminated_length": 145.4932861328125,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.01160855277196876,
      "grad_norm": 0.02668035216629505,
      "kl": 0.0008018910884857178,
      "learning_rate": 4.600000000000001e-06,
      "loss": -0.0001,
      "num_tokens": 11953490.0,
      "reward": 0.4242376387119293,
      "reward_std": 0.05728016048669815,
      "rewards/gemini_judge_reward_func/mean": 0.0982142835855484,
      "rewards/gemini_judge_reward_func/std": 0.20601151883602142,
      "rewards/semantic_correctness_reward_func/mean": 0.38750943541526794,
      "rewards/semantic_correctness_reward_func/std": 0.18285952508449554,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 301.0,
      "completions/mean_length": 147.875,
      "completions/mean_terminated_length": 139.9819793701172,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.011949980794673723,
      "grad_norm": 0.029554614797234535,
      "kl": 0.0027062706649303436,
      "learning_rate": 4.800000000000001e-06,
      "loss": -0.0146,
      "num_tokens": 12302866.0,
      "reward": 0.47618868947029114,
      "reward_std": 0.05818319693207741,
      "rewards/gemini_judge_reward_func/mean": 0.1595982164144516,
      "rewards/gemini_judge_reward_func/std": 0.29934054613113403,
      "rewards/semantic_correctness_reward_func/mean": 0.41724681854248047,
      "rewards/semantic_correctness_reward_func/std": 0.2225155085325241,
      "rewards/xmlcount_reward_func/mean": 0.8222500085830688,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 391.0,
      "completions/mean_length": 146.45089721679688,
      "completions/mean_terminated_length": 134.53846740722656,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.012291408817378686,
      "grad_norm": 0.028692543506622314,
      "kl": 0.0030185282230377197,
      "learning_rate": 5e-06,
      "loss": -0.0039,
      "num_tokens": 12669655.0,
      "reward": 0.3967846930027008,
      "reward_std": 0.05047953501343727,
      "rewards/gemini_judge_reward_func/mean": 0.1082589253783226,
      "rewards/gemini_judge_reward_func/std": 0.23257403075695038,
      "rewards/semantic_correctness_reward_func/mean": 0.4089055061340332,
      "rewards/semantic_correctness_reward_func/std": 0.21490508317947388,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 264.0,
      "completions/mean_length": 149.4241180419922,
      "completions/mean_terminated_length": 137.5520477294922,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.01263283684008365,
      "grad_norm": 0.030691703781485558,
      "kl": 0.002063453197479248,
      "learning_rate": 5.2e-06,
      "loss": -0.0337,
      "num_tokens": 13029822.0,
      "reward": 0.42319977283477783,
      "reward_std": 0.06844579428434372,
      "rewards/gemini_judge_reward_func/mean": 0.1060267835855484,
      "rewards/gemini_judge_reward_func/std": 0.23599198460578918,
      "rewards/semantic_correctness_reward_func/mean": 0.4158558249473572,
      "rewards/semantic_correctness_reward_func/std": 0.20988070964813232,
      "rewards/xmlcount_reward_func/mean": 0.7440447211265564,
      "rewards/xmlcount_reward_func/std": 0.43694427609443665,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 868.0,
      "completions/mean_length": 159.00894165039062,
      "completions/mean_terminated_length": 147.2669677734375,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.012974264862788614,
      "grad_norm": 0.027180153876543045,
      "kl": 0.002780407667160034,
      "learning_rate": 5.400000000000001e-06,
      "loss": -0.0143,
      "num_tokens": 13386520.0,
      "reward": 0.4280446171760559,
      "reward_std": 0.07069174945354462,
      "rewards/gemini_judge_reward_func/mean": 0.1428571492433548,
      "rewards/gemini_judge_reward_func/std": 0.28156015276908875,
      "rewards/semantic_correctness_reward_func/mean": 0.4245087206363678,
      "rewards/semantic_correctness_reward_func/std": 0.21585093438625336,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 924.0,
      "completions/mean_length": 152.5803680419922,
      "completions/mean_terminated_length": 144.729736328125,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.013315692885493577,
      "grad_norm": 0.02773498371243477,
      "kl": 0.00479482114315033,
      "learning_rate": 5.600000000000001e-06,
      "loss": -0.0215,
      "num_tokens": 13759430.0,
      "reward": 0.37968793511390686,
      "reward_std": 0.055167291313409805,
      "rewards/gemini_judge_reward_func/mean": 0.0904017835855484,
      "rewards/gemini_judge_reward_func/std": 0.1978796124458313,
      "rewards/semantic_correctness_reward_func/mean": 0.3948860466480255,
      "rewards/semantic_correctness_reward_func/std": 0.18360565602779388,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 830.0,
      "completions/mean_length": 147.25,
      "completions/mean_terminated_length": 135.34841918945312,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.01365712090819854,
      "grad_norm": 0.025785459205508232,
      "kl": 0.006071865558624268,
      "learning_rate": 5.8e-06,
      "loss": -0.0544,
      "num_tokens": 14144086.0,
      "reward": 0.3708299696445465,
      "reward_std": 0.06480063498020172,
      "rewards/gemini_judge_reward_func/mean": 0.0915178582072258,
      "rewards/gemini_judge_reward_func/std": 0.21174997091293335,
      "rewards/semantic_correctness_reward_func/mean": 0.4064801037311554,
      "rewards/semantic_correctness_reward_func/std": 0.1985797882080078,
      "rewards/xmlcount_reward_func/mean": 0.6323170065879822,
      "rewards/xmlcount_reward_func/std": 0.48041653633117676,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 146.66964721679688,
      "completions/mean_terminated_length": 138.76576232910156,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.013998548930903503,
      "grad_norm": 0.025612158700823784,
      "kl": 0.0058727264404296875,
      "learning_rate": 6e-06,
      "loss": -0.0175,
      "num_tokens": 14512272.0,
      "reward": 0.39734622836112976,
      "reward_std": 0.0646364837884903,
      "rewards/gemini_judge_reward_func/mean": 0.1305803507566452,
      "rewards/gemini_judge_reward_func/std": 0.24711813032627106,
      "rewards/semantic_correctness_reward_func/mean": 0.43632930517196655,
      "rewards/semantic_correctness_reward_func/std": 0.19070696830749512,
      "rewards/xmlcount_reward_func/mean": 0.6446205973625183,
      "rewards/xmlcount_reward_func/std": 0.479495108127594,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 889.0,
      "completions/mean_length": 137.9866180419922,
      "completions/mean_terminated_length": 130.00450134277344,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.014339976953608467,
      "grad_norm": 0.029739174991846085,
      "kl": 0.007889151573181152,
      "learning_rate": 6.200000000000001e-06,
      "loss": -0.0672,
      "num_tokens": 14863201.0,
      "reward": 0.3775990903377533,
      "reward_std": 0.05630933493375778,
      "rewards/gemini_judge_reward_func/mean": 0.0814732164144516,
      "rewards/gemini_judge_reward_func/std": 0.20312152802944183,
      "rewards/semantic_correctness_reward_func/mean": 0.3665488064289093,
      "rewards/semantic_correctness_reward_func/std": 0.19233566522598267,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 880.0,
      "completions/max_terminated_length": 880.0,
      "completions/mean_length": 143.08929443359375,
      "completions/mean_terminated_length": 143.08929443359375,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.014681404976313431,
      "grad_norm": 0.02772599086165428,
      "kl": 0.004380345344543457,
      "learning_rate": 6.4000000000000006e-06,
      "loss": -0.0156,
      "num_tokens": 15217021.0,
      "reward": 0.41600289940834045,
      "reward_std": 0.06771310418844223,
      "rewards/gemini_judge_reward_func/mean": 0.1149553582072258,
      "rewards/gemini_judge_reward_func/std": 0.21152234077453613,
      "rewards/semantic_correctness_reward_func/mean": 0.42010369896888733,
      "rewards/semantic_correctness_reward_func/std": 0.18937553465366364,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 300.0,
      "completions/mean_length": 134.53125,
      "completions/mean_terminated_length": 130.5426025390625,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.015022832999018394,
      "grad_norm": 0.025546282529830933,
      "kl": 0.0051773786544799805,
      "learning_rate": 6.600000000000001e-06,
      "loss": 0.0141,
      "num_tokens": 15581232.0,
      "reward": 0.3940742611885071,
      "reward_std": 0.06104440987110138,
      "rewards/gemini_judge_reward_func/mean": 0.125,
      "rewards/gemini_judge_reward_func/std": 0.2593541443347931,
      "rewards/semantic_correctness_reward_func/mean": 0.45124611258506775,
      "rewards/semantic_correctness_reward_func/std": 0.2126028686761856,
      "rewards/xmlcount_reward_func/mean": 0.6345625519752502,
      "rewards/xmlcount_reward_func/std": 0.48329102993011475,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 284.0,
      "completions/mean_length": 140.85714721679688,
      "completions/mean_terminated_length": 128.86878967285156,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.015364261021723358,
      "grad_norm": 0.027798650786280632,
      "kl": 0.006904497742652893,
      "learning_rate": 6.800000000000001e-06,
      "loss": 0.0193,
      "num_tokens": 15928136.0,
      "reward": 0.40762490034103394,
      "reward_std": 0.06153449788689613,
      "rewards/gemini_judge_reward_func/mean": 0.1171875,
      "rewards/gemini_judge_reward_func/std": 0.21424943208694458,
      "rewards/semantic_correctness_reward_func/mean": 0.4541868567466736,
      "rewards/semantic_correctness_reward_func/std": 0.20098300278186798,
      "rewards/xmlcount_reward_func/mean": 0.6747812628746033,
      "rewards/xmlcount_reward_func/std": 0.4702269732952118,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 271.0,
      "completions/mean_length": 148.0803680419922,
      "completions/mean_terminated_length": 132.154541015625,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.01570568904442832,
      "grad_norm": 0.028658276423811913,
      "kl": 0.006076395511627197,
      "learning_rate": 7e-06,
      "loss": 0.016,
      "num_tokens": 16311014.0,
      "reward": 0.39668479561805725,
      "reward_std": 0.063643679022789,
      "rewards/gemini_judge_reward_func/mean": 0.1138392835855484,
      "rewards/gemini_judge_reward_func/std": 0.23170354962348938,
      "rewards/semantic_correctness_reward_func/mean": 0.4329952597618103,
      "rewards/semantic_correctness_reward_func/std": 0.2143063247203827,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 303.0,
      "completions/mean_length": 143.875,
      "completions/mean_terminated_length": 139.92825317382812,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.016047117067133285,
      "grad_norm": 0.027143213897943497,
      "kl": 0.0036936402320861816,
      "learning_rate": 7.2000000000000005e-06,
      "loss": -0.0194,
      "num_tokens": 16672714.0,
      "reward": 0.4406120777130127,
      "reward_std": 0.0816921517252922,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.24939869344234467,
      "rewards/semantic_correctness_reward_func/mean": 0.44264957308769226,
      "rewards/semantic_correctness_reward_func/std": 0.22831664979457855,
      "rewards/xmlcount_reward_func/mean": 0.7596697211265564,
      "rewards/xmlcount_reward_func/std": 0.42911025881767273,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 708.0,
      "completions/mean_length": 133.13394165039062,
      "completions/mean_terminated_length": 125.10810852050781,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.01638854508983825,
      "grad_norm": 0.02828553505241871,
      "kl": 0.011866092681884766,
      "learning_rate": 7.4e-06,
      "loss": -0.0569,
      "num_tokens": 17044560.0,
      "reward": 0.3188920021057129,
      "reward_std": 0.03988515958189964,
      "rewards/gemini_judge_reward_func/mean": 0.0535714291036129,
      "rewards/gemini_judge_reward_func/std": 0.1862076371908188,
      "rewards/semantic_correctness_reward_func/mean": 0.3433171808719635,
      "rewards/semantic_correctness_reward_func/std": 0.18960954248905182,
      "rewards/xmlcount_reward_func/mean": 0.5720000267028809,
      "rewards/xmlcount_reward_func/std": 0.4964759945869446,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 543.0,
      "completions/mean_length": 143.87054443359375,
      "completions/mean_terminated_length": 131.92308044433594,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.016729973112543213,
      "grad_norm": 0.028787607327103615,
      "kl": 0.010211586952209473,
      "learning_rate": 7.600000000000001e-06,
      "loss": -0.0146,
      "num_tokens": 17410671.0,
      "reward": 0.38920655846595764,
      "reward_std": 0.07314638048410416,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.2295006811618805,
      "rewards/semantic_correctness_reward_func/mean": 0.39355048537254333,
      "rewards/semantic_correctness_reward_func/std": 0.2066669762134552,
      "rewards/xmlcount_reward_func/mean": 0.6557053923606873,
      "rewards/xmlcount_reward_func/std": 0.510606586933136,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 965.0,
      "completions/max_terminated_length": 965.0,
      "completions/mean_length": 140.32144165039062,
      "completions/mean_terminated_length": 140.32144165039062,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.017071401135248177,
      "grad_norm": 0.026758279651403427,
      "kl": 0.0061858296394348145,
      "learning_rate": 7.800000000000002e-06,
      "loss": -0.0145,
      "num_tokens": 17774383.0,
      "reward": 0.4114672839641571,
      "reward_std": 0.06060100719332695,
      "rewards/gemini_judge_reward_func/mean": 0.0904017835855484,
      "rewards/gemini_judge_reward_func/std": 0.20346620678901672,
      "rewards/semantic_correctness_reward_func/mean": 0.41078296303749084,
      "rewards/semantic_correctness_reward_func/std": 0.2236628234386444,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 162.0803680419922,
      "completions/mean_terminated_length": 154.31532287597656,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.017412829157953138,
      "grad_norm": 0.02499496378004551,
      "kl": 0.0020468831062316895,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.0036,
      "num_tokens": 18123241.0,
      "reward": 0.4477907717227936,
      "reward_std": 0.06217062473297119,
      "rewards/gemini_judge_reward_func/mean": 0.1037946417927742,
      "rewards/gemini_judge_reward_func/std": 0.22855830192565918,
      "rewards/semantic_correctness_reward_func/mean": 0.4226144850254059,
      "rewards/semantic_correctness_reward_func/std": 0.2091536521911621,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 51
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 317.0,
      "completions/max_terminated_length": 317.0,
      "completions/mean_length": 145.8169708251953,
      "completions/mean_terminated_length": 145.8169708251953,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.017754257180658102,
      "grad_norm": 0.025865867733955383,
      "kl": 0.00410914421081543,
      "learning_rate": 8.2e-06,
      "loss": -0.0136,
      "num_tokens": 18445812.0,
      "reward": 0.4681364595890045,
      "reward_std": 0.07395092397928238,
      "rewards/gemini_judge_reward_func/mean": 0.1305803507566452,
      "rewards/gemini_judge_reward_func/std": 0.26884591579437256,
      "rewards/semantic_correctness_reward_func/mean": 0.4350215494632721,
      "rewards/semantic_correctness_reward_func/std": 0.2260739952325821,
      "rewards/xmlcount_reward_func/mean": 0.8222500085830688,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 52
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 290.0,
      "completions/mean_length": 154.15179443359375,
      "completions/mean_terminated_length": 138.33636474609375,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.018095685203363066,
      "grad_norm": 0.026150401681661606,
      "kl": 0.005943477153778076,
      "learning_rate": 8.400000000000001e-06,
      "loss": 0.0209,
      "num_tokens": 18803846.0,
      "reward": 0.45366746187210083,
      "reward_std": 0.07954549789428711,
      "rewards/gemini_judge_reward_func/mean": 0.1729910671710968,
      "rewards/gemini_judge_reward_func/std": 0.3112905025482178,
      "rewards/semantic_correctness_reward_func/mean": 0.4566049575805664,
      "rewards/semantic_correctness_reward_func/std": 0.2221759408712387,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 53
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 332.0,
      "completions/mean_length": 139.625,
      "completions/mean_terminated_length": 135.65919494628906,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.01843711322606803,
      "grad_norm": 0.024718625470995903,
      "kl": 0.006244301795959473,
      "learning_rate": 8.6e-06,
      "loss": 0.0007,
      "num_tokens": 19149074.0,
      "reward": 0.44792598485946655,
      "reward_std": 0.06949326395988464,
      "rewards/gemini_judge_reward_func/mean": 0.1495535671710968,
      "rewards/gemini_judge_reward_func/std": 0.25926730036735535,
      "rewards/semantic_correctness_reward_func/mean": 0.44349589943885803,
      "rewards/semantic_correctness_reward_func/std": 0.20944607257843018,
      "rewards/xmlcount_reward_func/mean": 0.748513400554657,
      "rewards/xmlcount_reward_func/std": 0.43441200256347656,
      "step": 54
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 565.0,
      "completions/mean_length": 165.9241180419922,
      "completions/mean_terminated_length": 146.3333282470703,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.018778541248772995,
      "grad_norm": 0.024257266893982887,
      "kl": 0.009581208229064941,
      "learning_rate": 8.8e-06,
      "loss": -0.0089,
      "num_tokens": 19531933.0,
      "reward": 0.403461754322052,
      "reward_std": 0.06676840037107468,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.27608317136764526,
      "rewards/semantic_correctness_reward_func/mean": 0.4199782907962799,
      "rewards/semantic_correctness_reward_func/std": 0.25535663962364197,
      "rewards/xmlcount_reward_func/mean": 0.6703169941902161,
      "rewards/xmlcount_reward_func/std": 0.46707943081855774,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 162.83929443359375,
      "completions/mean_terminated_length": 155.08108520507812,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.019119969271477955,
      "grad_norm": 0.024257266893982887,
      "kl": 0.006259918212890625,
      "learning_rate": 8.8e-06,
      "loss": -0.0184,
      "num_tokens": 19886317.0,
      "reward": 0.4279636740684509,
      "reward_std": 0.06266574561595917,
      "rewards/gemini_judge_reward_func/mean": 0.0926339253783226,
      "rewards/gemini_judge_reward_func/std": 0.19683989882469177,
      "rewards/semantic_correctness_reward_func/mean": 0.4396754205226898,
      "rewards/semantic_correctness_reward_func/std": 0.2028336226940155,
      "rewards/xmlcount_reward_func/mean": 0.7574375867843628,
      "rewards/xmlcount_reward_func/std": 0.42914968729019165,
      "step": 56
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 238.0,
      "completions/mean_length": 137.7053680419922,
      "completions/mean_terminated_length": 133.73094177246094,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.01946139729418292,
      "grad_norm": 0.027076715603470802,
      "kl": 0.01098167896270752,
      "learning_rate": 9e-06,
      "loss": 0.0159,
      "num_tokens": 20248451.0,
      "reward": 0.3951142132282257,
      "reward_std": 0.06582393497228622,
      "rewards/gemini_judge_reward_func/mean": 0.1037946417927742,
      "rewards/gemini_judge_reward_func/std": 0.23698663711547852,
      "rewards/semantic_correctness_reward_func/mean": 0.4094816744327545,
      "rewards/semantic_correctness_reward_func/std": 0.21601319313049316,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 57
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 329.0,
      "completions/max_terminated_length": 329.0,
      "completions/mean_length": 144.91964721679688,
      "completions/mean_terminated_length": 144.91964721679688,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.019802825316887884,
      "grad_norm": 0.024703042581677437,
      "kl": 0.00683748722076416,
      "learning_rate": 9.200000000000002e-06,
      "loss": -0.0068,
      "num_tokens": 20557377.0,
      "reward": 0.444562703371048,
      "reward_std": 0.06522774696350098,
      "rewards/gemini_judge_reward_func/mean": 0.1183035746216774,
      "rewards/gemini_judge_reward_func/std": 0.20355534553527832,
      "rewards/semantic_correctness_reward_func/mean": 0.4221436679363251,
      "rewards/semantic_correctness_reward_func/std": 0.20329251885414124,
      "rewards/xmlcount_reward_func/mean": 0.7820313572883606,
      "rewards/xmlcount_reward_func/std": 0.41473886370658875,
      "step": 58
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 151.1919708251953,
      "completions/mean_terminated_length": 139.34390258789062,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.020144253339592848,
      "grad_norm": 0.028883758932352066,
      "kl": 0.010497450828552246,
      "learning_rate": 9.4e-06,
      "loss": -0.0058,
      "num_tokens": 20920732.0,
      "reward": 0.44578060507774353,
      "reward_std": 0.07391282916069031,
      "rewards/gemini_judge_reward_func/mean": 0.1238839253783226,
      "rewards/gemini_judge_reward_func/std": 0.2527846693992615,
      "rewards/semantic_correctness_reward_func/mean": 0.4438849985599518,
      "rewards/semantic_correctness_reward_func/std": 0.2132381945848465,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 59
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 148.88839721679688,
      "completions/mean_terminated_length": 144.96412658691406,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.020485681362297812,
      "grad_norm": 0.026241201907396317,
      "kl": 0.00708240270614624,
      "learning_rate": 9.600000000000001e-06,
      "loss": -0.016,
      "num_tokens": 21251643.0,
      "reward": 0.433006227016449,
      "reward_std": 0.06476236879825592,
      "rewards/gemini_judge_reward_func/mean": 0.1261160671710968,
      "rewards/gemini_judge_reward_func/std": 0.22584888339042664,
      "rewards/semantic_correctness_reward_func/mean": 0.4112989008426666,
      "rewards/semantic_correctness_reward_func/std": 0.23198209702968597,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 306.0,
      "completions/mean_length": 163.16519165039062,
      "completions/mean_terminated_length": 143.51141357421875,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.020827109385002773,
      "grad_norm": 0.025193244218826294,
      "kl": 0.007884740829467773,
      "learning_rate": 9.800000000000001e-06,
      "loss": -0.0213,
      "num_tokens": 21602088.0,
      "reward": 0.45057666301727295,
      "reward_std": 0.0753309428691864,
      "rewards/gemini_judge_reward_func/mean": 0.1183035746216774,
      "rewards/gemini_judge_reward_func/std": 0.23784302175045013,
      "rewards/semantic_correctness_reward_func/mean": 0.4097670912742615,
      "rewards/semantic_correctness_reward_func/std": 0.2002406269311905,
      "rewards/xmlcount_reward_func/mean": 0.8032545447349548,
      "rewards/xmlcount_reward_func/std": 0.3983818590641022,
      "step": 61
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 336.0,
      "completions/mean_length": 143.79464721679688,
      "completions/mean_terminated_length": 139.8475341796875,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.021168537407707737,
      "grad_norm": 0.02755032107234001,
      "kl": 0.006412327289581299,
      "learning_rate": 1e-05,
      "loss": -0.0059,
      "num_tokens": 21971558.0,
      "reward": 0.44730687141418457,
      "reward_std": 0.05102415755391121,
      "rewards/gemini_judge_reward_func/mean": 0.0848214253783226,
      "rewards/gemini_judge_reward_func/std": 0.19394874572753906,
      "rewards/semantic_correctness_reward_func/mean": 0.386641263961792,
      "rewards/semantic_correctness_reward_func/std": 0.18178777396678925,
      "rewards/xmlcount_reward_func/mean": 0.8401250839233398,
      "rewards/xmlcount_reward_func/std": 0.3684578537940979,
      "step": 62
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 869.0,
      "completions/mean_length": 158.55804443359375,
      "completions/mean_terminated_length": 150.76126098632812,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.0215099654304127,
      "grad_norm": 0.024078436195850372,
      "kl": 0.007021784782409668,
      "learning_rate": 9.999972660400536e-06,
      "loss": -0.0319,
      "num_tokens": 22334655.0,
      "reward": 0.460908979177475,
      "reward_std": 0.0708736777305603,
      "rewards/gemini_judge_reward_func/mean": 0.1305803507566452,
      "rewards/gemini_judge_reward_func/std": 0.27298402786254883,
      "rewards/semantic_correctness_reward_func/mean": 0.43463388085365295,
      "rewards/semantic_correctness_reward_func/std": 0.2317088544368744,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 63
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 453.0,
      "completions/mean_length": 160.65179443359375,
      "completions/mean_terminated_length": 136.88990783691406,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.021851393453117665,
      "grad_norm": 0.024365782737731934,
      "kl": 0.0076389312744140625,
      "learning_rate": 9.999890641901124e-06,
      "loss": -0.0079,
      "num_tokens": 22692925.0,
      "reward": 0.4045267403125763,
      "reward_std": 0.06387098878622055,
      "rewards/gemini_judge_reward_func/mean": 0.1372767835855484,
      "rewards/gemini_judge_reward_func/std": 0.23524853587150574,
      "rewards/semantic_correctness_reward_func/mean": 0.4610799252986908,
      "rewards/semantic_correctness_reward_func/std": 0.19871395826339722,
      "rewards/xmlcount_reward_func/mean": 0.643500030040741,
      "rewards/xmlcount_reward_func/std": 0.48071083426475525,
      "step": 64
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 676.0,
      "completions/max_terminated_length": 676.0,
      "completions/mean_length": 146.6741180419922,
      "completions/mean_terminated_length": 146.6741180419922,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.02219282147582263,
      "grad_norm": 0.023962823674082756,
      "kl": 0.006612420082092285,
      "learning_rate": 9.999753945398704e-06,
      "loss": -0.0137,
      "num_tokens": 23054064.0,
      "reward": 0.4475335478782654,
      "reward_std": 0.06912019103765488,
      "rewards/gemini_judge_reward_func/mean": 0.1428571492433548,
      "rewards/gemini_judge_reward_func/std": 0.2683078646659851,
      "rewards/semantic_correctness_reward_func/mean": 0.414703369140625,
      "rewards/semantic_correctness_reward_func/std": 0.20449979603290558,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 893.0,
      "completions/mean_length": 160.45982360839844,
      "completions/mean_terminated_length": 144.7590789794922,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.02253424949852759,
      "grad_norm": 0.02477749064564705,
      "kl": 0.011017203330993652,
      "learning_rate": 9.99956257238817e-06,
      "loss": -0.0039,
      "num_tokens": 23412271.0,
      "reward": 0.4194851815700531,
      "reward_std": 0.06413974612951279,
      "rewards/gemini_judge_reward_func/mean": 0.1305803507566452,
      "rewards/gemini_judge_reward_func/std": 0.24138078093528748,
      "rewards/semantic_correctness_reward_func/mean": 0.44201499223709106,
      "rewards/semantic_correctness_reward_func/std": 0.2022552639245987,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 66
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 506.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 152.97769165039062,
      "completions/mean_terminated_length": 152.97769165039062,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.022875677521232554,
      "grad_norm": 0.02420092560350895,
      "kl": 0.004943966865539551,
      "learning_rate": 9.999316524962347e-06,
      "loss": 0.0069,
      "num_tokens": 23778402.0,
      "reward": 0.45530790090560913,
      "reward_std": 0.06338375061750412,
      "rewards/gemini_judge_reward_func/mean": 0.1216517835855484,
      "rewards/gemini_judge_reward_func/std": 0.21953363716602325,
      "rewards/semantic_correctness_reward_func/mean": 0.43342334032058716,
      "rewards/semantic_correctness_reward_func/std": 0.19556362926959991,
      "rewards/xmlcount_reward_func/mean": 0.799906313419342,
      "rewards/xmlcount_reward_func/std": 0.40196701884269714,
      "step": 67
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 938.0,
      "completions/mean_length": 160.02679443359375,
      "completions/mean_terminated_length": 148.29864501953125,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.02321710554393752,
      "grad_norm": 0.024799056351184845,
      "kl": 0.012816905975341797,
      "learning_rate": 9.999015805811965e-06,
      "loss": -0.0062,
      "num_tokens": 24150064.0,
      "reward": 0.4068303406238556,
      "reward_std": 0.0636182427406311,
      "rewards/gemini_judge_reward_func/mean": 0.1116071417927742,
      "rewards/gemini_judge_reward_func/std": 0.20322315394878387,
      "rewards/semantic_correctness_reward_func/mean": 0.4032764434814453,
      "rewards/semantic_correctness_reward_func/std": 0.2195780724287033,
      "rewards/xmlcount_reward_func/mean": 0.7038304209709167,
      "rewards/xmlcount_reward_func/std": 0.4546702206134796,
      "step": 68
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 630.0,
      "completions/mean_length": 181.34376525878906,
      "completions/mean_terminated_length": 150.13426208496094,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.023558533566642482,
      "grad_norm": 0.02307036519050598,
      "kl": 0.005774140357971191,
      "learning_rate": 9.998660418225645e-06,
      "loss": -0.0188,
      "num_tokens": 24533433.0,
      "reward": 0.3904527425765991,
      "reward_std": 0.06628313660621643,
      "rewards/gemini_judge_reward_func/mean": 0.0870535746216774,
      "rewards/gemini_judge_reward_func/std": 0.1809597611427307,
      "rewards/semantic_correctness_reward_func/mean": 0.39731696248054504,
      "rewards/semantic_correctness_reward_func/std": 0.19073733687400818,
      "rewards/xmlcount_reward_func/mean": 0.6904196739196777,
      "rewards/xmlcount_reward_func/std": 0.4628920555114746,
      "step": 69
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 373.0,
      "completions/mean_length": 159.8928680419922,
      "completions/mean_terminated_length": 140.1643829345703,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.023899961589347447,
      "grad_norm": 0.025376953184604645,
      "kl": 0.00702059268951416,
      "learning_rate": 9.998250366089848e-06,
      "loss": -0.0119,
      "num_tokens": 24890453.0,
      "reward": 0.3956013023853302,
      "reward_std": 0.062416452914476395,
      "rewards/gemini_judge_reward_func/mean": 0.1049107164144516,
      "rewards/gemini_judge_reward_func/std": 0.18533207476139069,
      "rewards/semantic_correctness_reward_func/mean": 0.4300599694252014,
      "rewards/semantic_correctness_reward_func/std": 0.21468216180801392,
      "rewards/xmlcount_reward_func/mean": 0.6690624952316284,
      "rewards/xmlcount_reward_func/std": 0.5187152624130249,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 761.0,
      "completions/mean_length": 157.625,
      "completions/mean_terminated_length": 149.81982421875,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.02424138961205241,
      "grad_norm": 0.025495875626802444,
      "kl": 0.005542397499084473,
      "learning_rate": 9.997785653888835e-06,
      "loss": 0.0107,
      "num_tokens": 25250817.0,
      "reward": 0.3918258845806122,
      "reward_std": 0.04926810413599014,
      "rewards/gemini_judge_reward_func/mean": 0.0591517873108387,
      "rewards/gemini_judge_reward_func/std": 0.16611815989017487,
      "rewards/semantic_correctness_reward_func/mean": 0.3750758469104767,
      "rewards/semantic_correctness_reward_func/std": 0.16816116869449615,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 71
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 855.0,
      "completions/mean_length": 161.25894165039062,
      "completions/mean_terminated_length": 149.54751586914062,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.02458281763475737,
      "grad_norm": 0.02477003075182438,
      "kl": 0.008527755737304688,
      "learning_rate": 9.99726628670463e-06,
      "loss": -0.0166,
      "num_tokens": 25625067.0,
      "reward": 0.4139401614665985,
      "reward_std": 0.07099711894989014,
      "rewards/gemini_judge_reward_func/mean": 0.1417410671710968,
      "rewards/gemini_judge_reward_func/std": 0.29332807660102844,
      "rewards/semantic_correctness_reward_func/mean": 0.4232363700866699,
      "rewards/semantic_correctness_reward_func/std": 0.21512280404567719,
      "rewards/xmlcount_reward_func/mean": 0.6814910769462585,
      "rewards/xmlcount_reward_func/std": 0.4640570878982544,
      "step": 72
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 155.7857208251953,
      "completions/mean_terminated_length": 151.8923797607422,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.024924245657462336,
      "grad_norm": 0.025401996448636055,
      "kl": 0.0072678327560424805,
      "learning_rate": 9.996692270216946e-06,
      "loss": 0.0065,
      "num_tokens": 25947051.0,
      "reward": 0.4428517818450928,
      "reward_std": 0.07458332180976868,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.23313553631305695,
      "rewards/semantic_correctness_reward_func/mean": 0.43593719601631165,
      "rewards/semantic_correctness_reward_func/std": 0.20209956169128418,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 73
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 528.0,
      "completions/mean_length": 160.8303680419922,
      "completions/mean_terminated_length": 149.11312866210938,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.0252656736801673,
      "grad_norm": 0.023690922185778618,
      "kl": 0.008687734603881836,
      "learning_rate": 9.996063610703138e-06,
      "loss": -0.0026,
      "num_tokens": 26289245.0,
      "reward": 0.4326671361923218,
      "reward_std": 0.07397673279047012,
      "rewards/gemini_judge_reward_func/mean": 0.1484375,
      "rewards/gemini_judge_reward_func/std": 0.2781420946121216,
      "rewards/semantic_correctness_reward_func/mean": 0.4355142414569855,
      "rewards/semantic_correctness_reward_func/std": 0.20559628307819366,
      "rewards/xmlcount_reward_func/mean": 0.7154732942581177,
      "rewards/xmlcount_reward_func/std": 0.5058081746101379,
      "step": 74
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 321.0,
      "completions/mean_length": 165.9553680419922,
      "completions/mean_terminated_length": 142.33944702148438,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.025607101702872264,
      "grad_norm": 0.02803461067378521,
      "kl": 0.01035165786743164,
      "learning_rate": 9.995380315038119e-06,
      "loss": 0.0036,
      "num_tokens": 26674083.0,
      "reward": 0.3838358223438263,
      "reward_std": 0.06592860817909241,
      "rewards/gemini_judge_reward_func/mean": 0.1372767835855484,
      "rewards/gemini_judge_reward_func/std": 0.25469791889190674,
      "rewards/semantic_correctness_reward_func/mean": 0.4291253685951233,
      "rewards/semantic_correctness_reward_func/std": 0.21939414739608765,
      "rewards/xmlcount_reward_func/mean": 0.6077500581741333,
      "rewards/xmlcount_reward_func/std": 0.48996880650520325,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 794.0,
      "completions/max_terminated_length": 794.0,
      "completions/mean_length": 158.9866180419922,
      "completions/mean_terminated_length": 158.9866180419922,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.025948529725577228,
      "grad_norm": 0.025290068238973618,
      "kl": 0.006925344467163086,
      "learning_rate": 9.994642390694308e-06,
      "loss": -0.0185,
      "num_tokens": 27000188.0,
      "reward": 0.44614139199256897,
      "reward_std": 0.043179940432310104,
      "rewards/gemini_judge_reward_func/mean": 0.0770089253783226,
      "rewards/gemini_judge_reward_func/std": 0.19360975921154022,
      "rewards/semantic_correctness_reward_func/mean": 0.3964388370513916,
      "rewards/semantic_correctness_reward_func/std": 0.1987782120704651,
      "rewards/xmlcount_reward_func/mean": 0.8401250839233398,
      "rewards/xmlcount_reward_func/std": 0.3684578835964203,
      "step": 76
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 524.0,
      "completions/mean_length": 159.27232360839844,
      "completions/mean_terminated_length": 151.4819793701172,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.02628995774828219,
      "grad_norm": 0.023038053885102272,
      "kl": 0.009779095649719238,
      "learning_rate": 9.993849845741525e-06,
      "loss": -0.0165,
      "num_tokens": 27375557.0,
      "reward": 0.40907853841781616,
      "reward_std": 0.07521604001522064,
      "rewards/gemini_judge_reward_func/mean": 0.1339285671710968,
      "rewards/gemini_judge_reward_func/std": 0.2708328068256378,
      "rewards/semantic_correctness_reward_func/mean": 0.419696182012558,
      "rewards/semantic_correctness_reward_func/std": 0.20244161784648895,
      "rewards/xmlcount_reward_func/mean": 0.6789196729660034,
      "rewards/xmlcount_reward_func/std": 0.4654209613800049,
      "step": 77
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 167.2366180419922,
      "completions/mean_terminated_length": 155.60633850097656,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.026631385770987153,
      "grad_norm": 0.023966865614056587,
      "kl": 0.009274482727050781,
      "learning_rate": 9.993002688846913e-06,
      "loss": 0.0102,
      "num_tokens": 27739970.0,
      "reward": 0.4366755783557892,
      "reward_std": 0.07375102490186691,
      "rewards/gemini_judge_reward_func/mean": 0.1439732164144516,
      "rewards/gemini_judge_reward_func/std": 0.2324018031358719,
      "rewards/semantic_correctness_reward_func/mean": 0.4509579837322235,
      "rewards/semantic_correctness_reward_func/std": 0.18391193449497223,
      "rewards/xmlcount_reward_func/mean": 0.7222366333007812,
      "rewards/xmlcount_reward_func/std": 0.4450782239437103,
      "step": 78
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0535714285714286,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 866.0,
      "completions/mean_length": 188.74107360839844,
      "completions/mean_terminated_length": 141.46226501464844,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.026972813793692117,
      "grad_norm": 0.024487853050231934,
      "kl": 0.01374959945678711,
      "learning_rate": 9.992100929274848e-06,
      "loss": -0.0392,
      "num_tokens": 28137276.0,
      "reward": 0.35368138551712036,
      "reward_std": 0.06350585073232651,
      "rewards/gemini_judge_reward_func/mean": 0.1238839253783226,
      "rewards/gemini_judge_reward_func/std": 0.23437467217445374,
      "rewards/semantic_correctness_reward_func/mean": 0.4414603114128113,
      "rewards/semantic_correctness_reward_func/std": 0.19774943590164185,
      "rewards/xmlcount_reward_func/mean": 0.5395892858505249,
      "rewards/xmlcount_reward_func/std": 0.4981267750263214,
      "step": 79
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 407.0,
      "completions/mean_length": 164.45089721679688,
      "completions/mean_terminated_length": 140.7935791015625,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.02731424181639708,
      "grad_norm": 0.02403208427131176,
      "kl": 0.011779546737670898,
      "learning_rate": 9.991144576886824e-06,
      "loss": -0.0213,
      "num_tokens": 28517273.0,
      "reward": 0.3964942395687103,
      "reward_std": 0.058009881526231766,
      "rewards/gemini_judge_reward_func/mean": 0.1361607164144516,
      "rewards/gemini_judge_reward_func/std": 0.24118632078170776,
      "rewards/semantic_correctness_reward_func/mean": 0.4231494963169098,
      "rewards/semantic_correctness_reward_func/std": 0.2020299881696701,
      "rewards/xmlcount_reward_func/mean": 0.643500030040741,
      "rewards/xmlcount_reward_func/std": 0.48071083426475525,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 895.0,
      "completions/mean_length": 161.32144165039062,
      "completions/mean_terminated_length": 157.4529266357422,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.027655669839102046,
      "grad_norm": 0.022105740383267403,
      "kl": 0.009316205978393555,
      "learning_rate": 9.990133642141359e-06,
      "loss": 0.0047,
      "num_tokens": 28865717.0,
      "reward": 0.44927382469177246,
      "reward_std": 0.06996231526136398,
      "rewards/gemini_judge_reward_func/mean": 0.1294642835855484,
      "rewards/gemini_judge_reward_func/std": 0.2233111709356308,
      "rewards/semantic_correctness_reward_func/mean": 0.4501902759075165,
      "rewards/semantic_correctness_reward_func/std": 0.20007607340812683,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 81
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 879.0,
      "completions/mean_length": 164.7857208251953,
      "completions/mean_terminated_length": 149.16363525390625,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.027997097861807006,
      "grad_norm": 0.02345276065170765,
      "kl": 0.012184381484985352,
      "learning_rate": 9.989068136093873e-06,
      "loss": -0.0162,
      "num_tokens": 29228517.0,
      "reward": 0.44200220704078674,
      "reward_std": 0.08123025298118591,
      "rewards/gemini_judge_reward_func/mean": 0.1529017835855484,
      "rewards/gemini_judge_reward_func/std": 0.2866664230823517,
      "rewards/semantic_correctness_reward_func/mean": 0.4497699439525604,
      "rewards/semantic_correctness_reward_func/std": 0.21584469079971313,
      "rewards/xmlcount_reward_func/mean": 0.727218747138977,
      "rewards/xmlcount_reward_func/std": 0.4462124705314636,
      "step": 82
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 501.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 150.43304443359375,
      "completions/mean_terminated_length": 150.43304443359375,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.02833852588451197,
      "grad_norm": 0.022828485816717148,
      "kl": 0.011949777603149414,
      "learning_rate": 9.987948070396572e-06,
      "loss": -0.0194,
      "num_tokens": 29601014.0,
      "reward": 0.42945489287376404,
      "reward_std": 0.06218741089105606,
      "rewards/gemini_judge_reward_func/mean": 0.140625,
      "rewards/gemini_judge_reward_func/std": 0.24440200626850128,
      "rewards/semantic_correctness_reward_func/mean": 0.4360244572162628,
      "rewards/semantic_correctness_reward_func/std": 0.2285270392894745,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 83
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 575.0,
      "completions/mean_length": 168.99107360839844,
      "completions/mean_terminated_length": 149.47030639648438,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.028679953907216935,
      "grad_norm": 0.025373326614499092,
      "kl": 0.01007533073425293,
      "learning_rate": 9.986773457298311e-06,
      "loss": -0.0258,
      "num_tokens": 29964904.0,
      "reward": 0.4488268494606018,
      "reward_std": 0.07115625590085983,
      "rewards/gemini_judge_reward_func/mean": 0.1071428582072258,
      "rewards/gemini_judge_reward_func/std": 0.2136441320180893,
      "rewards/semantic_correctness_reward_func/mean": 0.4210982918739319,
      "rewards/semantic_correctness_reward_func/std": 0.20950660109519958,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 84
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 478.0,
      "completions/mean_length": 155.44644165039062,
      "completions/mean_terminated_length": 147.6216278076172,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.0290213819299219,
      "grad_norm": 0.024450762197375298,
      "kl": 0.011575698852539062,
      "learning_rate": 9.985544309644474e-06,
      "loss": -0.0256,
      "num_tokens": 30329712.0,
      "reward": 0.4297153353691101,
      "reward_std": 0.06669414043426514,
      "rewards/gemini_judge_reward_func/mean": 0.1194196417927742,
      "rewards/gemini_judge_reward_func/std": 0.20902319252490997,
      "rewards/semantic_correctness_reward_func/mean": 0.4439873695373535,
      "rewards/semantic_correctness_reward_func/std": 0.18770352005958557,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 430.0,
      "completions/mean_length": 166.62054443359375,
      "completions/mean_terminated_length": 151.0318145751953,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.029362809952626863,
      "grad_norm": 0.02488037198781967,
      "kl": 0.009441137313842773,
      "learning_rate": 9.984260640876821e-06,
      "loss": -0.0263,
      "num_tokens": 30661951.0,
      "reward": 0.44441041350364685,
      "reward_std": 0.05848705768585205,
      "rewards/gemini_judge_reward_func/mean": 0.109375,
      "rewards/gemini_judge_reward_func/std": 0.23385359346866608,
      "rewards/semantic_correctness_reward_func/mean": 0.3945517838001251,
      "rewards/semantic_correctness_reward_func/std": 0.2169126272201538,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 86
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 880.0,
      "completions/mean_length": 169.80357360839844,
      "completions/mean_terminated_length": 146.2935791015625,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.029704237975331824,
      "grad_norm": 0.023754147812724113,
      "kl": 0.011398077011108398,
      "learning_rate": 9.98292246503335e-06,
      "loss": -0.0201,
      "num_tokens": 31028499.0,
      "reward": 0.39785102009773254,
      "reward_std": 0.05590759217739105,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.2527772784233093,
      "rewards/semantic_correctness_reward_func/mean": 0.44779059290885925,
      "rewards/semantic_correctness_reward_func/std": 0.1991998553276062,
      "rewards/xmlcount_reward_func/mean": 0.643500030040741,
      "rewards/xmlcount_reward_func/std": 0.48071083426475525,
      "step": 87
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 282.0,
      "completions/mean_length": 156.375,
      "completions/mean_terminated_length": 148.55856323242188,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.030045665998036788,
      "grad_norm": 0.023754147812724113,
      "kl": 0.009763479232788086,
      "learning_rate": 9.98292246503335e-06,
      "loss": 0.0098,
      "num_tokens": 31364211.0,
      "reward": 0.4608972370624542,
      "reward_std": 0.0726761519908905,
      "rewards/gemini_judge_reward_func/mean": 0.1428571492433548,
      "rewards/gemini_judge_reward_func/std": 0.23607943952083588,
      "rewards/semantic_correctness_reward_func/mean": 0.48152169585227966,
      "rewards/semantic_correctness_reward_func/std": 0.17897123098373413,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 88
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 447.0,
      "completions/mean_length": 166.60269165039062,
      "completions/mean_terminated_length": 147.0273895263672,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.030387094020741752,
      "grad_norm": 0.02381015755236149,
      "kl": 0.014133691787719727,
      "learning_rate": 9.981529796748135e-06,
      "loss": 0.0053,
      "num_tokens": 31723014.0,
      "reward": 0.4156407117843628,
      "reward_std": 0.06060226634144783,
      "rewards/gemini_judge_reward_func/mean": 0.1082589253783226,
      "rewards/gemini_judge_reward_func/std": 0.23136581480503082,
      "rewards/semantic_correctness_reward_func/mean": 0.43168550729751587,
      "rewards/semantic_correctness_reward_func/std": 0.21562719345092773,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 89
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 162.58482360839844,
      "completions/mean_terminated_length": 154.82432556152344,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.030728522043446716,
      "grad_norm": 0.02302616834640503,
      "kl": 0.008588314056396484,
      "learning_rate": 9.980082651251175e-06,
      "loss": -0.0421,
      "num_tokens": 32084121.0,
      "reward": 0.43415945768356323,
      "reward_std": 0.07369165122509003,
      "rewards/gemini_judge_reward_func/mean": 0.1071428582072258,
      "rewards/gemini_judge_reward_func/std": 0.22389310598373413,
      "rewards/semantic_correctness_reward_func/mean": 0.45501139760017395,
      "rewards/semantic_correctness_reward_func/std": 0.21243339776992798,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 859.0,
      "completions/mean_length": 166.32144165039062,
      "completions/mean_terminated_length": 146.73971557617188,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.03106995006615168,
      "grad_norm": 0.02333798073232174,
      "kl": 0.014536380767822266,
      "learning_rate": 9.97858104436822e-06,
      "loss": -0.0453,
      "num_tokens": 32455833.0,
      "reward": 0.38213050365448,
      "reward_std": 0.05550408363342285,
      "rewards/gemini_judge_reward_func/mean": 0.0915178582072258,
      "rewards/gemini_judge_reward_func/std": 0.21174997091293335,
      "rewards/semantic_correctness_reward_func/mean": 0.40486663579940796,
      "rewards/semantic_correctness_reward_func/std": 0.2065107375383377,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 91
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 256.0,
      "completions/mean_length": 160.05804443359375,
      "completions/mean_terminated_length": 140.3333282470703,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.03141137808885664,
      "grad_norm": 0.023909490555524826,
      "kl": 0.010914802551269531,
      "learning_rate": 9.977024992520604e-06,
      "loss": -0.0204,
      "num_tokens": 32794082.0,
      "reward": 0.4354327321052551,
      "reward_std": 0.06427149474620819,
      "rewards/gemini_judge_reward_func/mean": 0.1350446492433548,
      "rewards/gemini_judge_reward_func/std": 0.23890070617198944,
      "rewards/semantic_correctness_reward_func/mean": 0.4413240849971771,
      "rewards/semantic_correctness_reward_func/std": 0.20441272854804993,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 92
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 809.0,
      "completions/mean_length": 164.6875,
      "completions/mean_terminated_length": 149.06362915039062,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.03175280611156161,
      "grad_norm": 0.023738721385598183,
      "kl": 0.0113372802734375,
      "learning_rate": 9.975414512725058e-06,
      "loss": -0.027,
      "num_tokens": 33150324.0,
      "reward": 0.41209447383880615,
      "reward_std": 0.08080209791660309,
      "rewards/gemini_judge_reward_func/mean": 0.1383928507566452,
      "rewards/gemini_judge_reward_func/std": 0.23637604713439941,
      "rewards/semantic_correctness_reward_func/mean": 0.4341597557067871,
      "rewards/semantic_correctness_reward_func/std": 0.2263120412826538,
      "rewards/xmlcount_reward_func/mean": 0.6747633814811707,
      "rewards/xmlcount_reward_func/std": 0.4702146351337433,
      "step": 93
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 170.7857208251953,
      "completions/mean_terminated_length": 151.30592346191406,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.03209423413426657,
      "grad_norm": 0.024146920070052147,
      "kl": 0.011864662170410156,
      "learning_rate": 9.973749622593534e-06,
      "loss": 0.0014,
      "num_tokens": 33538188.0,
      "reward": 0.411119282245636,
      "reward_std": 0.057556502521038055,
      "rewards/gemini_judge_reward_func/mean": 0.1171875,
      "rewards/gemini_judge_reward_func/std": 0.22571587562561035,
      "rewards/semantic_correctness_reward_func/mean": 0.4269711673259735,
      "rewards/semantic_correctness_reward_func/std": 0.21064431965351105,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 94
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 664.0,
      "completions/mean_length": 160.40179443359375,
      "completions/mean_terminated_length": 156.52915954589844,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.03243566215697153,
      "grad_norm": 0.024719731882214546,
      "kl": 0.010405540466308594,
      "learning_rate": 9.972030340333e-06,
      "loss": -0.0024,
      "num_tokens": 33882934.0,
      "reward": 0.44124558568000793,
      "reward_std": 0.07164882123470306,
      "rewards/gemini_judge_reward_func/mean": 0.1160714253783226,
      "rewards/gemini_judge_reward_func/std": 0.22814461588859558,
      "rewards/semantic_correctness_reward_func/mean": 0.4279598593711853,
      "rewards/semantic_correctness_reward_func/std": 0.19526031613349915,
      "rewards/xmlcount_reward_func/mean": 0.7730625867843628,
      "rewards/xmlcount_reward_func/std": 0.4180354177951813,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 418.0,
      "completions/mean_length": 156.02679443359375,
      "completions/mean_terminated_length": 144.24435424804688,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.0327770901796765,
      "grad_norm": 0.02348247356712818,
      "kl": 0.009704351425170898,
      "learning_rate": 9.970256684745258e-06,
      "loss": -0.0143,
      "num_tokens": 34226852.0,
      "reward": 0.43676093220710754,
      "reward_std": 0.06913831830024719,
      "rewards/gemini_judge_reward_func/mean": 0.1350446492433548,
      "rewards/gemini_judge_reward_func/std": 0.2558937668800354,
      "rewards/semantic_correctness_reward_func/mean": 0.46584004163742065,
      "rewards/semantic_correctness_reward_func/std": 0.2180854231119156,
      "rewards/xmlcount_reward_func/mean": 0.7239375114440918,
      "rewards/xmlcount_reward_func/std": 0.4488600790500641,
      "step": 96
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 756.0,
      "completions/mean_length": 185.54019165039062,
      "completions/mean_terminated_length": 158.4930877685547,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.03311851820238146,
      "grad_norm": 0.02116106078028679,
      "kl": 0.008137702941894531,
      "learning_rate": 9.968428675226714e-06,
      "loss": 0.0093,
      "num_tokens": 34568397.0,
      "reward": 0.4413779377937317,
      "reward_std": 0.05757666751742363,
      "rewards/gemini_judge_reward_func/mean": 0.1138392835855484,
      "rewards/gemini_judge_reward_func/std": 0.22927159070968628,
      "rewards/semantic_correctness_reward_func/mean": 0.42639848589897156,
      "rewards/semantic_correctness_reward_func/std": 0.20541919767856598,
      "rewards/xmlcount_reward_func/mean": 0.7764062881469727,
      "rewards/xmlcount_reward_func/std": 0.4174662232398987,
      "step": 97
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 772.0,
      "completions/mean_length": 153.9732208251953,
      "completions/mean_terminated_length": 146.1351318359375,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.033459946225086426,
      "grad_norm": 0.02458954229950905,
      "kl": 0.012489795684814453,
      "learning_rate": 9.966546331768192e-06,
      "loss": -0.029,
      "num_tokens": 34916131.0,
      "reward": 0.4533292353153229,
      "reward_std": 0.05946576967835426,
      "rewards/gemini_judge_reward_func/mean": 0.1540178507566452,
      "rewards/gemini_judge_reward_func/std": 0.2796248495578766,
      "rewards/semantic_correctness_reward_func/mean": 0.45711034536361694,
      "rewards/semantic_correctness_reward_func/std": 0.23785069584846497,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 98
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 694.0,
      "completions/mean_length": 152.41519165039062,
      "completions/mean_terminated_length": 144.5630645751953,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.03380137424779139,
      "grad_norm": 0.02564327046275139,
      "kl": 0.01104736328125,
      "learning_rate": 9.964609674954696e-06,
      "loss": -0.0008,
      "num_tokens": 35263604.0,
      "reward": 0.45329129695892334,
      "reward_std": 0.08842761814594269,
      "rewards/gemini_judge_reward_func/mean": 0.1540178507566452,
      "rewards/gemini_judge_reward_func/std": 0.26096048951148987,
      "rewards/semantic_correctness_reward_func/mean": 0.45692071318626404,
      "rewards/semantic_correctness_reward_func/std": 0.2106001377105713,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 99
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 161.0803680419922,
      "completions/mean_terminated_length": 137.3302764892578,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.034142802270496354,
      "grad_norm": 0.02536987140774727,
      "kl": 0.011751174926757812,
      "learning_rate": 9.962618725965196e-06,
      "loss": -0.0198,
      "num_tokens": 35630690.0,
      "reward": 0.4004952311515808,
      "reward_std": 0.06513310968875885,
      "rewards/gemini_judge_reward_func/mean": 0.1339285671710968,
      "rewards/gemini_judge_reward_func/std": 0.26027876138687134,
      "rewards/semantic_correctness_reward_func/mean": 0.44761887192726135,
      "rewards/semantic_correctness_reward_func/std": 0.19793058931827545,
      "rewards/xmlcount_reward_func/mean": 0.643500030040741,
      "rewards/xmlcount_reward_func/std": 0.48071083426475525,
      "step": 100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 964.0,
      "completions/mean_length": 160.3928680419922,
      "completions/mean_terminated_length": 144.69090270996094,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.034484230293201315,
      "grad_norm": 0.024176469072699547,
      "kl": 0.01227426528930664,
      "learning_rate": 9.960573506572391e-06,
      "loss": -0.0191,
      "num_tokens": 36018366.0,
      "reward": 0.3939872682094574,
      "reward_std": 0.0706588476896286,
      "rewards/gemini_judge_reward_func/mean": 0.1227678582072258,
      "rewards/gemini_judge_reward_func/std": 0.22830908000469208,
      "rewards/semantic_correctness_reward_func/mean": 0.4105878472328186,
      "rewards/semantic_correctness_reward_func/std": 0.19481845200061798,
      "rewards/xmlcount_reward_func/mean": 0.656906247138977,
      "rewards/xmlcount_reward_func/std": 0.47649866342544556,
      "step": 101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 351.0,
      "completions/max_terminated_length": 351.0,
      "completions/mean_length": 153.43304443359375,
      "completions/mean_terminated_length": 153.43304443359375,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.034825658315906276,
      "grad_norm": 0.025619763880968094,
      "kl": 0.00950765609741211,
      "learning_rate": 9.95847403914247e-06,
      "loss": -0.0065,
      "num_tokens": 36359395.0,
      "reward": 0.44875597953796387,
      "reward_std": 0.05499029532074928,
      "rewards/gemini_judge_reward_func/mean": 0.0770089253783226,
      "rewards/gemini_judge_reward_func/std": 0.16035409271717072,
      "rewards/semantic_correctness_reward_func/mean": 0.3737618029117584,
      "rewards/semantic_correctness_reward_func/std": 0.17840272188186646,
      "rewards/xmlcount_reward_func/mean": 0.8580000996589661,
      "rewards/xmlcount_reward_func/std": 0.35106155276298523,
      "step": 102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 812.0,
      "completions/mean_length": 162.66519165039062,
      "completions/mean_terminated_length": 150.9728546142578,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.03516708633861124,
      "grad_norm": 0.022210268303751945,
      "kl": 0.01260066032409668,
      "learning_rate": 9.956320346634877e-06,
      "loss": -0.0227,
      "num_tokens": 36727936.0,
      "reward": 0.38284486532211304,
      "reward_std": 0.05238700285553932,
      "rewards/gemini_judge_reward_func/mean": 0.0993303582072258,
      "rewards/gemini_judge_reward_func/std": 0.2006424516439438,
      "rewards/semantic_correctness_reward_func/mean": 0.3570633828639984,
      "rewards/semantic_correctness_reward_func/std": 0.17993000149726868,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 650.0,
      "completions/mean_length": 148.9866180419922,
      "completions/mean_terminated_length": 145.0627899169922,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.035508514361316204,
      "grad_norm": 0.024317806586623192,
      "kl": 0.011048316955566406,
      "learning_rate": 9.954112452602045e-06,
      "loss": 0.004,
      "num_tokens": 37079877.0,
      "reward": 0.43635034561157227,
      "reward_std": 0.061006706207990646,
      "rewards/gemini_judge_reward_func/mean": 0.1227678582072258,
      "rewards/gemini_judge_reward_func/std": 0.22334477305412292,
      "rewards/semantic_correctness_reward_func/mean": 0.4347156882286072,
      "rewards/semantic_correctness_reward_func/std": 0.18455860018730164,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 297.0,
      "completions/mean_length": 146.26339721679688,
      "completions/mean_terminated_length": 138.35586547851562,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.03584994238402117,
      "grad_norm": 0.024815207347273827,
      "kl": 0.012839555740356445,
      "learning_rate": 9.951850381189152e-06,
      "loss": -0.0026,
      "num_tokens": 37415952.0,
      "reward": 0.44215184450149536,
      "reward_std": 0.072935089468956,
      "rewards/gemini_judge_reward_func/mean": 0.1361607164144516,
      "rewards/gemini_judge_reward_func/std": 0.22804586589336395,
      "rewards/semantic_correctness_reward_func/mean": 0.44141072034835815,
      "rewards/semantic_correctness_reward_func/std": 0.20808282494544983,
      "rewards/xmlcount_reward_func/mean": 0.748513400554657,
      "rewards/xmlcount_reward_func/std": 0.43441200256347656,
      "step": 105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 570.0,
      "completions/max_terminated_length": 570.0,
      "completions/mean_length": 146.9866180419922,
      "completions/mean_terminated_length": 146.9866180419922,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.03619137040672613,
      "grad_norm": 0.024026190862059593,
      "kl": 0.011193037033081055,
      "learning_rate": 9.949534157133844e-06,
      "loss": -0.031,
      "num_tokens": 37739521.0,
      "reward": 0.4355818033218384,
      "reward_std": 0.06153957173228264,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.225807324051857,
      "rewards/semantic_correctness_reward_func/mean": 0.4353373646736145,
      "rewards/semantic_correctness_reward_func/std": 0.20193377137184143,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1015.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 155.53125,
      "completions/mean_terminated_length": 155.53125,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.03653279842943109,
      "grad_norm": 0.024166366085410118,
      "kl": 0.014576911926269531,
      "learning_rate": 9.94716380576598e-06,
      "loss": -0.0241,
      "num_tokens": 38103460.0,
      "reward": 0.40730592608451843,
      "reward_std": 0.05648142471909523,
      "rewards/gemini_judge_reward_func/mean": 0.0982142835855484,
      "rewards/gemini_judge_reward_func/std": 0.21662190556526184,
      "rewards/semantic_correctness_reward_func/mean": 0.4013420045375824,
      "rewards/semantic_correctness_reward_func/std": 0.19534234702587128,
      "rewards/xmlcount_reward_func/mean": 0.7193795442581177,
      "rewards/xmlcount_reward_func/std": 0.45101282000541687,
      "step": 107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 666.0,
      "completions/mean_length": 153.33929443359375,
      "completions/mean_terminated_length": 145.49549865722656,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.03687422645213606,
      "grad_norm": 0.023334842175245285,
      "kl": 0.010807275772094727,
      "learning_rate": 9.944739353007344e-06,
      "loss": -0.0011,
      "num_tokens": 38444604.0,
      "reward": 0.44895491003990173,
      "reward_std": 0.0724000632762909,
      "rewards/gemini_judge_reward_func/mean": 0.1450892835855484,
      "rewards/gemini_judge_reward_func/std": 0.2784051299095154,
      "rewards/semantic_correctness_reward_func/mean": 0.45309582352638245,
      "rewards/semantic_correctness_reward_func/std": 0.2039255052804947,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 367.0,
      "completions/mean_length": 154.91519165039062,
      "completions/mean_terminated_length": 147.08558654785156,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.03721565447484102,
      "grad_norm": 0.023959942162036896,
      "kl": 0.013292789459228516,
      "learning_rate": 9.942260825371359e-06,
      "loss": -0.0149,
      "num_tokens": 38779941.0,
      "reward": 0.44470787048339844,
      "reward_std": 0.057739898562431335,
      "rewards/gemini_judge_reward_func/mean": 0.1049107164144516,
      "rewards/gemini_judge_reward_func/std": 0.21080242097377777,
      "rewards/semantic_correctness_reward_func/mean": 0.44071775674819946,
      "rewards/semantic_correctness_reward_func/std": 0.1816713809967041,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 865.0,
      "completions/mean_length": 152.68304443359375,
      "completions/mean_terminated_length": 144.83334350585938,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.03755708249754599,
      "grad_norm": 0.024471363052725792,
      "kl": 0.014130592346191406,
      "learning_rate": 9.939728249962808e-06,
      "loss": -0.0143,
      "num_tokens": 39145830.0,
      "reward": 0.38968026638031006,
      "reward_std": 0.059608135372400284,
      "rewards/gemini_judge_reward_func/mean": 0.1015625,
      "rewards/gemini_judge_reward_func/std": 0.21571606397628784,
      "rewards/semantic_correctness_reward_func/mean": 0.3867761790752411,
      "rewards/semantic_correctness_reward_func/std": 0.2074183076620102,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 151.26339721679688,
      "completions/mean_terminated_length": 147.3497772216797,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.03789851052025095,
      "grad_norm": 0.02474793791770935,
      "kl": 0.012769222259521484,
      "learning_rate": 9.937141654477529e-06,
      "loss": -0.0224,
      "num_tokens": 39505753.0,
      "reward": 0.40522071719169617,
      "reward_std": 0.06155985966324806,
      "rewards/gemini_judge_reward_func/mean": 0.1060267835855484,
      "rewards/gemini_judge_reward_func/std": 0.22875528037548065,
      "rewards/semantic_correctness_reward_func/mean": 0.4198000431060791,
      "rewards/semantic_correctness_reward_func/std": 0.1814102977514267,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 463.0,
      "completions/mean_length": 151.0803680419922,
      "completions/mean_terminated_length": 139.23077392578125,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.03823993854295591,
      "grad_norm": 0.02324873022735119,
      "kl": 0.01199030876159668,
      "learning_rate": 9.934501067202117e-06,
      "loss": -0.0208,
      "num_tokens": 39904871.0,
      "reward": 0.411319375038147,
      "reward_std": 0.0837107002735138,
      "rewards/gemini_judge_reward_func/mean": 0.1316964328289032,
      "rewards/gemini_judge_reward_func/std": 0.2739836871623993,
      "rewards/semantic_correctness_reward_func/mean": 0.45257896184921265,
      "rewards/semantic_correctness_reward_func/std": 0.22245247662067413,
      "rewards/xmlcount_reward_func/mean": 0.6703125238418579,
      "rewards/xmlcount_reward_func/std": 0.4718664884567261,
      "step": 112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 271.0,
      "completions/mean_length": 156.05804443359375,
      "completions/mean_terminated_length": 144.27603149414062,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.03858136656566088,
      "grad_norm": 0.025134863331913948,
      "kl": 0.011367321014404297,
      "learning_rate": 9.931806517013612e-06,
      "loss": 0.0098,
      "num_tokens": 40254420.0,
      "reward": 0.43234628438949585,
      "reward_std": 0.08657613396644592,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.24603478610515594,
      "rewards/semantic_correctness_reward_func/mean": 0.4244635999202728,
      "rewards/semantic_correctness_reward_func/std": 0.20944832265377045,
      "rewards/xmlcount_reward_func/mean": 0.7414017915725708,
      "rewards/xmlcount_reward_func/std": 0.4350353181362152,
      "step": 113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 878.0,
      "completions/mean_length": 158.1607208251953,
      "completions/mean_terminated_length": 150.36036682128906,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.03892279458836584,
      "grad_norm": 0.023998796939849854,
      "kl": 0.015682697296142578,
      "learning_rate": 9.929058033379181e-06,
      "loss": -0.0196,
      "num_tokens": 40596484.0,
      "reward": 0.415109246969223,
      "reward_std": 0.05301095172762871,
      "rewards/gemini_judge_reward_func/mean": 0.0703125,
      "rewards/gemini_judge_reward_func/std": 0.16341476142406464,
      "rewards/semantic_correctness_reward_func/mean": 0.39767107367515564,
      "rewards/semantic_correctness_reward_func/std": 0.1908901482820511,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 419.0,
      "completions/mean_length": 152.52232360839844,
      "completions/mean_terminated_length": 140.6923065185547,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.039264222611070806,
      "grad_norm": 0.02244039997458458,
      "kl": 0.013239383697509766,
      "learning_rate": 9.926255646355804e-06,
      "loss": 0.0074,
      "num_tokens": 40948209.0,
      "reward": 0.43548721075057983,
      "reward_std": 0.06909479200839996,
      "rewards/gemini_judge_reward_func/mean": 0.171875,
      "rewards/gemini_judge_reward_func/std": 0.2720773220062256,
      "rewards/semantic_correctness_reward_func/mean": 0.4166412055492401,
      "rewards/semantic_correctness_reward_func/std": 0.2098981738090515,
      "rewards/xmlcount_reward_func/mean": 0.7085223197937012,
      "rewards/xmlcount_reward_func/std": 0.4550124406814575,
      "step": 115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 405.0,
      "completions/mean_length": 161.6294708251953,
      "completions/mean_terminated_length": 153.86036682128906,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.03960565063377577,
      "grad_norm": 0.022398851811885834,
      "kl": 0.010973930358886719,
      "learning_rate": 9.923399386589933e-06,
      "loss": -0.0155,
      "num_tokens": 41311642.0,
      "reward": 0.4142858684062958,
      "reward_std": 0.05273973196744919,
      "rewards/gemini_judge_reward_func/mean": 0.1361607164144516,
      "rewards/gemini_judge_reward_func/std": 0.2514272928237915,
      "rewards/semantic_correctness_reward_func/mean": 0.4406077563762665,
      "rewards/semantic_correctness_reward_func/std": 0.2008506804704666,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 323.0,
      "completions/mean_length": 150.70982360839844,
      "completions/mean_terminated_length": 146.79373168945312,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.03994707865648073,
      "grad_norm": 0.023609992116689682,
      "kl": 0.015387296676635742,
      "learning_rate": 9.920489285317169e-06,
      "loss": 0.0,
      "num_tokens": 41676577.0,
      "reward": 0.4282933175563812,
      "reward_std": 0.06423311680555344,
      "rewards/gemini_judge_reward_func/mean": 0.1049107164144516,
      "rewards/gemini_judge_reward_func/std": 0.19847622513771057,
      "rewards/semantic_correctness_reward_func/mean": 0.4301450550556183,
      "rewards/semantic_correctness_reward_func/std": 0.18929553031921387,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 480.0,
      "completions/mean_length": 165.3303680419922,
      "completions/mean_terminated_length": 149.71817016601562,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.040288506679185696,
      "grad_norm": 0.024569395929574966,
      "kl": 0.012789726257324219,
      "learning_rate": 9.917525374361913e-06,
      "loss": 0.0035,
      "num_tokens": 42018127.0,
      "reward": 0.4590550661087036,
      "reward_std": 0.06488455832004547,
      "rewards/gemini_judge_reward_func/mean": 0.1484375,
      "rewards/gemini_judge_reward_func/std": 0.24146370589733124,
      "rewards/semantic_correctness_reward_func/mean": 0.47008776664733887,
      "rewards/semantic_correctness_reward_func/std": 0.2246067225933075,
      "rewards/xmlcount_reward_func/mean": 0.7641563415527344,
      "rewards/xmlcount_reward_func/std": 0.4263768792152405,
      "step": 118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 273.0,
      "completions/mean_length": 143.3928680419922,
      "completions/mean_terminated_length": 139.44395446777344,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.040629934701890656,
      "grad_norm": 0.025357214733958244,
      "kl": 0.016598224639892578,
      "learning_rate": 9.91450768613702e-06,
      "loss": -0.0098,
      "num_tokens": 42389311.0,
      "reward": 0.39798423647880554,
      "reward_std": 0.06020635738968849,
      "rewards/gemini_judge_reward_func/mean": 0.0970982164144516,
      "rewards/gemini_judge_reward_func/std": 0.22159849107265472,
      "rewards/semantic_correctness_reward_func/mean": 0.4014746844768524,
      "rewards/semantic_correctness_reward_func/std": 0.19921056926250458,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 779.0,
      "completions/mean_length": 163.8616180419922,
      "completions/mean_terminated_length": 152.18553161621094,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.040971362724595624,
      "grad_norm": 0.02301410585641861,
      "kl": 0.012985706329345703,
      "learning_rate": 9.911436253643445e-06,
      "loss": -0.0195,
      "num_tokens": 42740136.0,
      "reward": 0.4232383370399475,
      "reward_std": 0.0674186572432518,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.2437458485364914,
      "rewards/semantic_correctness_reward_func/mean": 0.43172726035118103,
      "rewards/semantic_correctness_reward_func/std": 0.19425256550312042,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 594.0,
      "completions/mean_length": 155.0,
      "completions/mean_terminated_length": 143.20362854003906,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.041312790747300585,
      "grad_norm": 0.023077256977558136,
      "kl": 0.015984773635864258,
      "learning_rate": 9.908311110469881e-06,
      "loss": 0.0032,
      "num_tokens": 43095476.0,
      "reward": 0.4018684923648834,
      "reward_std": 0.06371606886386871,
      "rewards/gemini_judge_reward_func/mean": 0.1116071417927742,
      "rewards/gemini_judge_reward_func/std": 0.23037132620811462,
      "rewards/semantic_correctness_reward_func/mean": 0.4276280999183655,
      "rewards/semantic_correctness_reward_func/std": 0.2191450297832489,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853896975517273,
      "step": 121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 451.0,
      "completions/mean_length": 156.51339721679688,
      "completions/mean_terminated_length": 148.6981964111328,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.041654218770005545,
      "grad_norm": 0.02264971099793911,
      "kl": 0.01440286636352539,
      "learning_rate": 9.905132290792395e-06,
      "loss": -0.0204,
      "num_tokens": 43427135.0,
      "reward": 0.4580499231815338,
      "reward_std": 0.06534235179424286,
      "rewards/gemini_judge_reward_func/mean": 0.1383928507566452,
      "rewards/gemini_judge_reward_func/std": 0.2535383701324463,
      "rewards/semantic_correctness_reward_func/mean": 0.44046369194984436,
      "rewards/semantic_correctness_reward_func/std": 0.21319182217121124,
      "rewards/xmlcount_reward_func/mean": 0.786500096321106,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 700.0,
      "completions/mean_length": 175.90179443359375,
      "completions/mean_terminated_length": 152.55963134765625,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.04199564679271051,
      "grad_norm": 0.0236386526376009,
      "kl": 0.013439178466796875,
      "learning_rate": 9.901899829374048e-06,
      "loss": -0.0155,
      "num_tokens": 43793809.0,
      "reward": 0.41775208711624146,
      "reward_std": 0.0679284930229187,
      "rewards/gemini_judge_reward_func/mean": 0.1049107164144516,
      "rewards/gemini_judge_reward_func/std": 0.24407413601875305,
      "rewards/semantic_correctness_reward_func/mean": 0.41318878531455994,
      "rewards/semantic_correctness_reward_func/std": 0.20918205380439758,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 605.0,
      "completions/mean_length": 159.7991180419922,
      "completions/mean_terminated_length": 140.0684814453125,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.042337074815415474,
      "grad_norm": 0.02372078038752079,
      "kl": 0.014812469482421875,
      "learning_rate": 9.89861376156452e-06,
      "loss": -0.0073,
      "num_tokens": 44142160.0,
      "reward": 0.4198228418827057,
      "reward_std": 0.06786998361349106,
      "rewards/gemini_judge_reward_func/mean": 0.1551339328289032,
      "rewards/gemini_judge_reward_func/std": 0.28249844908714294,
      "rewards/semantic_correctness_reward_func/mean": 0.4303463101387024,
      "rewards/semantic_correctness_reward_func/std": 0.21610724925994873,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 284.0,
      "completions/mean_length": 156.77679443359375,
      "completions/mean_terminated_length": 148.96397399902344,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.04267850283812044,
      "grad_norm": 0.024051638320088387,
      "kl": 0.014676809310913086,
      "learning_rate": 9.895274123299724e-06,
      "loss": -0.0017,
      "num_tokens": 44485938.0,
      "reward": 0.42380136251449585,
      "reward_std": 0.060133740305900574,
      "rewards/gemini_judge_reward_func/mean": 0.1116071417927742,
      "rewards/gemini_judge_reward_func/std": 0.2178659439086914,
      "rewards/semantic_correctness_reward_func/mean": 0.3942924439907074,
      "rewards/semantic_correctness_reward_func/std": 0.22006738185882568,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 923.0,
      "completions/mean_length": 168.0625,
      "completions/mean_terminated_length": 156.44345092773438,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.0430199308608254,
      "grad_norm": 0.023526515811681747,
      "kl": 0.012475728988647461,
      "learning_rate": 9.891880951101407e-06,
      "loss": 0.0064,
      "num_tokens": 44856932.0,
      "reward": 0.40895187854766846,
      "reward_std": 0.0454825833439827,
      "rewards/gemini_judge_reward_func/mean": 0.0870535746216774,
      "rewards/gemini_judge_reward_func/std": 0.19867786765098572,
      "rewards/semantic_correctness_reward_func/mean": 0.4026699662208557,
      "rewards/semantic_correctness_reward_func/std": 0.20043903589248657,
      "rewards/xmlcount_reward_func/mean": 0.7339910864830017,
      "rewards/xmlcount_reward_func/std": 0.44274044036865234,
      "step": 126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 333.0,
      "completions/mean_length": 162.9375,
      "completions/mean_terminated_length": 147.2818145751953,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.04336135888353036,
      "grad_norm": 0.02356693334877491,
      "kl": 0.01137542724609375,
      "learning_rate": 9.888434282076759e-06,
      "loss": -0.0299,
      "num_tokens": 45207042.0,
      "reward": 0.42898255586624146,
      "reward_std": 0.05791693180799484,
      "rewards/gemini_judge_reward_func/mean": 0.0948660746216774,
      "rewards/gemini_judge_reward_func/std": 0.20417828857898712,
      "rewards/semantic_correctness_reward_func/mean": 0.38218027353286743,
      "rewards/semantic_correctness_reward_func/std": 0.1872968077659607,
      "rewards/xmlcount_reward_func/mean": 0.786500096321106,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 398.0,
      "completions/mean_length": 157.73214721679688,
      "completions/mean_terminated_length": 145.9728546142578,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.04370278690623533,
      "grad_norm": 0.023837530985474586,
      "kl": 0.013489961624145508,
      "learning_rate": 9.884934153917998e-06,
      "loss": -0.0031,
      "num_tokens": 45548426.0,
      "reward": 0.4394071698188782,
      "reward_std": 0.08121853321790695,
      "rewards/gemini_judge_reward_func/mean": 0.1116071417927742,
      "rewards/gemini_judge_reward_func/std": 0.23990675806999207,
      "rewards/semantic_correctness_reward_func/mean": 0.4495357573032379,
      "rewards/semantic_correctness_reward_func/std": 0.20210714638233185,
      "rewards/xmlcount_reward_func/mean": 0.7621428370475769,
      "rewards/xmlcount_reward_func/std": 0.483738511800766,
      "step": 128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 328.0,
      "completions/mean_length": 151.5803680419922,
      "completions/mean_terminated_length": 147.6681671142578,
      "completions/min_length": 40.0,
      "completions/min_terminated_length": 40.0,
      "epoch": 0.04404421492894029,
      "grad_norm": 0.024463778361678123,
      "kl": 0.014742374420166016,
      "learning_rate": 9.881380604901964e-06,
      "loss": -0.0051,
      "num_tokens": 45899380.0,
      "reward": 0.468868613243103,
      "reward_std": 0.06673333793878555,
      "rewards/gemini_judge_reward_func/mean": 0.1372767835855484,
      "rewards/gemini_judge_reward_func/std": 0.22798827290534973,
      "rewards/semantic_correctness_reward_func/mean": 0.46103930473327637,
      "rewards/semantic_correctness_reward_func/std": 0.2001686841249466,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 428.0,
      "completions/mean_length": 169.2991180419922,
      "completions/mean_terminated_length": 141.7281036376953,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.04438564295164526,
      "grad_norm": 0.02389535680413246,
      "kl": 0.013713598251342773,
      "learning_rate": 9.877773673889702e-06,
      "loss": 0.0058,
      "num_tokens": 46278347.0,
      "reward": 0.4204433560371399,
      "reward_std": 0.0702306404709816,
      "rewards/gemini_judge_reward_func/mean": 0.1439732164144516,
      "rewards/gemini_judge_reward_func/std": 0.235991969704628,
      "rewards/semantic_correctness_reward_func/mean": 0.44755586981773376,
      "rewards/semantic_correctness_reward_func/std": 0.216790109872818,
      "rewards/xmlcount_reward_func/mean": 0.6833571791648865,
      "rewards/xmlcount_reward_func/std": 0.46658626198768616,
      "step": 130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 400.0,
      "completions/mean_length": 155.3169708251953,
      "completions/mean_terminated_length": 151.42153930664062,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.04472707097435022,
      "grad_norm": 0.023298032581806183,
      "kl": 0.011845588684082031,
      "learning_rate": 9.874113400326031e-06,
      "loss": -0.0069,
      "num_tokens": 46584138.0,
      "reward": 0.5024296641349792,
      "reward_std": 0.07264265418052673,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.2560819387435913,
      "rewards/semantic_correctness_reward_func/mean": 0.4527016580104828,
      "rewards/semantic_correctness_reward_func/std": 0.20331206917762756,
      "rewards/xmlcount_reward_func/mean": 0.9024911522865295,
      "rewards/xmlcount_reward_func/std": 0.29851844906806946,
      "step": 131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 552.0,
      "completions/mean_length": 183.74554443359375,
      "completions/mean_terminated_length": 152.625,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.04506849899705518,
      "grad_norm": 0.022067122161388397,
      "kl": 0.019008159637451172,
      "learning_rate": 9.870399824239116e-06,
      "loss": -0.0057,
      "num_tokens": 46960697.0,
      "reward": 0.3944554924964905,
      "reward_std": 0.060083404183387756,
      "rewards/gemini_judge_reward_func/mean": 0.0948660746216774,
      "rewards/gemini_judge_reward_func/std": 0.2082555890083313,
      "rewards/semantic_correctness_reward_func/mean": 0.4240451157093048,
      "rewards/semantic_correctness_reward_func/std": 0.2055032104253769,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 374.0,
      "completions/mean_length": 161.64732360839844,
      "completions/mean_terminated_length": 149.94117736816406,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.04540992701976015,
      "grad_norm": 0.023691849783062935,
      "kl": 0.011993408203125,
      "learning_rate": 9.86663298624003e-06,
      "loss": -0.0322,
      "num_tokens": 47315422.0,
      "reward": 0.48760873079299927,
      "reward_std": 0.07489056885242462,
      "rewards/gemini_judge_reward_func/mean": 0.1428571492433548,
      "rewards/gemini_judge_reward_func/std": 0.25981688499450684,
      "rewards/semantic_correctness_reward_func/mean": 0.43856117129325867,
      "rewards/semantic_correctness_reward_func/std": 0.21570587158203125,
      "rewards/xmlcount_reward_func/mean": 0.8568840026855469,
      "rewards/xmlcount_reward_func/std": 0.3510022759437561,
      "step": 133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 688.0,
      "completions/mean_length": 167.0357208251953,
      "completions/mean_terminated_length": 155.40272521972656,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.04575135504246511,
      "grad_norm": 0.021651828661561012,
      "kl": 0.0135650634765625,
      "learning_rate": 9.86281292752231e-06,
      "loss": -0.0115,
      "num_tokens": 47674206.0,
      "reward": 0.4171735942363739,
      "reward_std": 0.0660734549164772,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.28012555837631226,
      "rewards/semantic_correctness_reward_func/mean": 0.4371536672115326,
      "rewards/semantic_correctness_reward_func/std": 0.21627959609031677,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 832.0,
      "completions/max_terminated_length": 832.0,
      "completions/mean_length": 149.97769165039062,
      "completions/mean_terminated_length": 149.97769165039062,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.046092783065170076,
      "grad_norm": 0.02360036037862301,
      "kl": 0.01648855209350586,
      "learning_rate": 9.858939689861506e-06,
      "loss": -0.0124,
      "num_tokens": 48012965.0,
      "reward": 0.44242221117019653,
      "reward_std": 0.07747457921504974,
      "rewards/gemini_judge_reward_func/mean": 0.1573660671710968,
      "rewards/gemini_judge_reward_func/std": 0.25402894616127014,
      "rewards/semantic_correctness_reward_func/mean": 0.44056612253189087,
      "rewards/semantic_correctness_reward_func/std": 0.23137415945529938,
      "rewards/xmlcount_reward_func/mean": 0.7284063100814819,
      "rewards/xmlcount_reward_func/std": 0.4465975761413574,
      "step": 135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 860.0,
      "completions/mean_length": 155.32589721679688,
      "completions/mean_terminated_length": 147.5,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.04643421108787504,
      "grad_norm": 0.022543828934431076,
      "kl": 0.0161590576171875,
      "learning_rate": 9.855013315614725e-06,
      "loss": -0.0333,
      "num_tokens": 48372202.0,
      "reward": 0.4096542000770569,
      "reward_std": 0.06795641779899597,
      "rewards/gemini_judge_reward_func/mean": 0.1104910746216774,
      "rewards/gemini_judge_reward_func/std": 0.23392580449581146,
      "rewards/semantic_correctness_reward_func/mean": 0.40622615814208984,
      "rewards/semantic_correctness_reward_func/std": 0.22462278604507446,
      "rewards/xmlcount_reward_func/mean": 0.7105312943458557,
      "rewards/xmlcount_reward_func/std": 0.4553159773349762,
      "step": 136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 157.4375,
      "completions/mean_terminated_length": 153.55157470703125,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.046775639110580004,
      "grad_norm": 0.021333523094654083,
      "kl": 0.012816905975341797,
      "learning_rate": 9.851033847720167e-06,
      "loss": -0.0072,
      "num_tokens": 48754868.0,
      "reward": 0.42013928294181824,
      "reward_std": 0.0754006877541542,
      "rewards/gemini_judge_reward_func/mean": 0.1305803507566452,
      "rewards/gemini_judge_reward_func/std": 0.27092286944389343,
      "rewards/semantic_correctness_reward_func/mean": 0.4452856183052063,
      "rewards/semantic_correctness_reward_func/std": 0.2089463472366333,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 159.84375,
      "completions/mean_terminated_length": 148.11312866210938,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.047117067133284965,
      "grad_norm": 0.02194381318986416,
      "kl": 0.011771917343139648,
      "learning_rate": 9.847001329696653e-06,
      "loss": 0.0087,
      "num_tokens": 49104497.0,
      "reward": 0.4554777145385742,
      "reward_std": 0.058295100927352905,
      "rewards/gemini_judge_reward_func/mean": 0.1305803507566452,
      "rewards/gemini_judge_reward_func/std": 0.22453762590885162,
      "rewards/semantic_correctness_reward_func/mean": 0.44322773814201355,
      "rewards/semantic_correctness_reward_func/std": 0.18111221492290497,
      "rewards/xmlcount_reward_func/mean": 0.786500096321106,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 468.0,
      "completions/mean_length": 150.88839721679688,
      "completions/mean_terminated_length": 146.9730987548828,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.047458495155989926,
      "grad_norm": 0.021744275465607643,
      "kl": 0.014683246612548828,
      "learning_rate": 9.842915805643156e-06,
      "loss": -0.0157,
      "num_tokens": 49473884.0,
      "reward": 0.4144614338874817,
      "reward_std": 0.06900496780872345,
      "rewards/gemini_judge_reward_func/mean": 0.1808035671710968,
      "rewards/gemini_judge_reward_func/std": 0.2961066961288452,
      "rewards/semantic_correctness_reward_func/mean": 0.447878360748291,
      "rewards/semantic_correctness_reward_func/std": 0.19721916317939758,
      "rewards/xmlcount_reward_func/mean": 0.6314107179641724,
      "rewards/xmlcount_reward_func/std": 0.48076197504997253,
      "step": 139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 168.02679443359375,
      "completions/mean_terminated_length": 144.46788024902344,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.04779992317869489,
      "grad_norm": 0.021885110065340996,
      "kl": 0.015885353088378906,
      "learning_rate": 9.838777320238312e-06,
      "loss": -0.0297,
      "num_tokens": 49867834.0,
      "reward": 0.39027801156044006,
      "reward_std": 0.06917236000299454,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.24603478610515594,
      "rewards/semantic_correctness_reward_func/mean": 0.43455058336257935,
      "rewards/semantic_correctness_reward_func/std": 0.20599418878555298,
      "rewards/xmlcount_reward_func/mean": 0.6311875581741333,
      "rewards/xmlcount_reward_func/std": 0.48217126727104187,
      "step": 140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 164.79464721679688,
      "completions/mean_terminated_length": 149.1727294921875,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.048141351201399854,
      "grad_norm": 0.021948551759123802,
      "kl": 0.014840126037597656,
      "learning_rate": 9.834585918739936e-06,
      "loss": 0.0182,
      "num_tokens": 50228388.0,
      "reward": 0.4530205726623535,
      "reward_std": 0.08578796684741974,
      "rewards/gemini_judge_reward_func/mean": 0.1796875,
      "rewards/gemini_judge_reward_func/std": 0.30927425622940063,
      "rewards/semantic_correctness_reward_func/mean": 0.4399777054786682,
      "rewards/semantic_correctness_reward_func/std": 0.2128545343875885,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 382.0,
      "completions/mean_length": 152.30357360839844,
      "completions/mean_terminated_length": 144.45045471191406,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.04848277922410482,
      "grad_norm": 0.02323133312165737,
      "kl": 0.015910625457763672,
      "learning_rate": 9.830341646984521e-06,
      "loss": 0.0111,
      "num_tokens": 50582596.0,
      "reward": 0.44822028279304504,
      "reward_std": 0.06913460791110992,
      "rewards/gemini_judge_reward_func/mean": 0.1607142835855484,
      "rewards/gemini_judge_reward_func/std": 0.27068495750427246,
      "rewards/semantic_correctness_reward_func/mean": 0.45392245054244995,
      "rewards/semantic_correctness_reward_func/std": 0.20474979281425476,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 279.0,
      "completions/mean_length": 167.79019165039062,
      "completions/mean_terminated_length": 148.24200439453125,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.04882420724680978,
      "grad_norm": 0.022231949493288994,
      "kl": 0.013165473937988281,
      "learning_rate": 9.826044551386743e-06,
      "loss": -0.0225,
      "num_tokens": 50930225.0,
      "reward": 0.44765713810920715,
      "reward_std": 0.06493684649467468,
      "rewards/gemini_judge_reward_func/mean": 0.1328125,
      "rewards/gemini_judge_reward_func/std": 0.26138925552368164,
      "rewards/semantic_correctness_reward_func/mean": 0.4354102909564972,
      "rewards/semantic_correctness_reward_func/std": 0.2049492746591568,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 742.0,
      "completions/mean_length": 168.24107360839844,
      "completions/mean_terminated_length": 148.70318603515625,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.04916563526951474,
      "grad_norm": 0.02268117107450962,
      "kl": 0.01633310317993164,
      "learning_rate": 9.821694678938954e-06,
      "loss": -0.0143,
      "num_tokens": 51295311.0,
      "reward": 0.37321770191192627,
      "reward_std": 0.06530405580997467,
      "rewards/gemini_judge_reward_func/mean": 0.125,
      "rewards/gemini_judge_reward_func/std": 0.22083210945129395,
      "rewards/semantic_correctness_reward_func/mean": 0.4452759623527527,
      "rewards/semantic_correctness_reward_func/std": 0.19345305860042572,
      "rewards/xmlcount_reward_func/mean": 0.5854063034057617,
      "rewards/xmlcount_reward_func/std": 0.49435025453567505,
      "step": 144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1004.0,
      "completions/mean_length": 162.8616180419922,
      "completions/mean_terminated_length": 151.1719512939453,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.04950706329221971,
      "grad_norm": 0.021489452570676804,
      "kl": 0.01521444320678711,
      "learning_rate": 9.817292077210658e-06,
      "loss": -0.012,
      "num_tokens": 51677272.0,
      "reward": 0.4057961404323578,
      "reward_std": 0.06886345148086548,
      "rewards/gemini_judge_reward_func/mean": 0.125,
      "rewards/gemini_judge_reward_func/std": 0.2516759932041168,
      "rewards/semantic_correctness_reward_func/mean": 0.4517931044101715,
      "rewards/semantic_correctness_reward_func/std": 0.20376348495483398,
      "rewards/xmlcount_reward_func/mean": 0.6635938286781311,
      "rewards/xmlcount_reward_func/std": 0.4730554521083832,
      "step": 145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 789.0,
      "completions/mean_length": 168.55804443359375,
      "completions/mean_terminated_length": 153.00454711914062,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.04984849131492467,
      "grad_norm": 0.023336565122008324,
      "kl": 0.012263774871826172,
      "learning_rate": 9.812836794348005e-06,
      "loss": -0.0348,
      "num_tokens": 52045293.0,
      "reward": 0.4521057605743408,
      "reward_std": 0.07398687303066254,
      "rewards/gemini_judge_reward_func/mean": 0.1462053507566452,
      "rewards/gemini_judge_reward_func/std": 0.25739532709121704,
      "rewards/semantic_correctness_reward_func/mean": 0.47555527091026306,
      "rewards/semantic_correctness_reward_func/std": 0.2144700139760971,
      "rewards/xmlcount_reward_func/mean": 0.7462812662124634,
      "rewards/xmlcount_reward_func/std": 0.4369716942310333,
      "step": 146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 158.34375,
      "completions/mean_terminated_length": 154.46189880371094,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.05018991933762964,
      "grad_norm": 0.022060193121433258,
      "kl": 0.014934062957763672,
      "learning_rate": 9.808328879073251e-06,
      "loss": -0.0244,
      "num_tokens": 52393254.0,
      "reward": 0.42671334743499756,
      "reward_std": 0.06478478014469147,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.25933241844177246,
      "rewards/semantic_correctness_reward_func/mean": 0.41112020611763,
      "rewards/semantic_correctness_reward_func/std": 0.1945081204175949,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 446.0,
      "completions/mean_length": 164.73214721679688,
      "completions/mean_terminated_length": 141.0825653076172,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.0505313473603346,
      "grad_norm": 0.02345438115298748,
      "kl": 0.016699790954589844,
      "learning_rate": 9.803768380684242e-06,
      "loss": -0.0423,
      "num_tokens": 52763070.0,
      "reward": 0.39287370443344116,
      "reward_std": 0.0494573637843132,
      "rewards/gemini_judge_reward_func/mean": 0.125,
      "rewards/gemini_judge_reward_func/std": 0.26043254137039185,
      "rewards/semantic_correctness_reward_func/mean": 0.3916184902191162,
      "rewards/semantic_correctness_reward_func/std": 0.2100548893213272,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 657.0,
      "completions/mean_length": 154.75,
      "completions/mean_terminated_length": 146.91893005371094,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.05087277538303956,
      "grad_norm": 0.02226773090660572,
      "kl": 0.015322446823120117,
      "learning_rate": 9.79915534905385e-06,
      "loss": -0.0125,
      "num_tokens": 53114594.0,
      "reward": 0.45522111654281616,
      "reward_std": 0.07392917573451996,
      "rewards/gemini_judge_reward_func/mean": 0.1852678507566452,
      "rewards/gemini_judge_reward_func/std": 0.28459224104881287,
      "rewards/semantic_correctness_reward_func/mean": 0.4398198127746582,
      "rewards/semantic_correctness_reward_func/std": 0.2161870151758194,
      "rewards/xmlcount_reward_func/mean": 0.7328750491142273,
      "rewards/xmlcount_reward_func/std": 0.44427841901779175,
      "step": 149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 404.0,
      "completions/mean_length": 166.0803680419922,
      "completions/mean_terminated_length": 150.4818115234375,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.05121420340574453,
      "grad_norm": 0.02213941141963005,
      "kl": 0.014892578125,
      "learning_rate": 9.794489834629457e-06,
      "loss": 0.0032,
      "num_tokens": 53466324.0,
      "reward": 0.4585185945034027,
      "reward_std": 0.08465974032878876,
      "rewards/gemini_judge_reward_func/mean": 0.140625,
      "rewards/gemini_judge_reward_func/std": 0.27259188890457153,
      "rewards/semantic_correctness_reward_func/mean": 0.4383428394794464,
      "rewards/semantic_correctness_reward_func/std": 0.21441145241260529,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 307.0,
      "completions/mean_length": 146.16964721679688,
      "completions/mean_terminated_length": 138.26126098632812,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.05155563142844949,
      "grad_norm": 0.02260763570666313,
      "kl": 0.014892101287841797,
      "learning_rate": 9.789771888432375e-06,
      "loss": 0.0059,
      "num_tokens": 53833862.0,
      "reward": 0.38839536905288696,
      "reward_std": 0.06746162474155426,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.24830262362957,
      "rewards/semantic_correctness_reward_func/mean": 0.4273248314857483,
      "rewards/semantic_correctness_reward_func/std": 0.20607294142246246,
      "rewards/xmlcount_reward_func/mean": 0.6300938725471497,
      "rewards/xmlcount_reward_func/std": 0.48451390862464905,
      "step": 151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 458.0,
      "completions/mean_length": 168.9732208251953,
      "completions/mean_terminated_length": 149.45204162597656,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.051897059451154456,
      "grad_norm": 0.022246094420552254,
      "kl": 0.013828754425048828,
      "learning_rate": 9.785001562057311e-06,
      "loss": -0.0048,
      "num_tokens": 54173988.0,
      "reward": 0.424908310174942,
      "reward_std": 0.06946201622486115,
      "rewards/gemini_judge_reward_func/mean": 0.1350446492433548,
      "rewards/gemini_judge_reward_func/std": 0.2602379024028778,
      "rewards/semantic_correctness_reward_func/mean": 0.4132825434207916,
      "rewards/semantic_correctness_reward_func/std": 0.20428363978862762,
      "rewards/xmlcount_reward_func/mean": 0.7205848693847656,
      "rewards/xmlcount_reward_func/std": 0.44707995653152466,
      "step": 152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 859.0,
      "completions/mean_length": 160.70982360839844,
      "completions/mean_terminated_length": 141.0,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.05223848747385942,
      "grad_norm": 0.022493815049529076,
      "kl": 0.015873432159423828,
      "learning_rate": 9.780178907671788e-06,
      "loss": -0.018,
      "num_tokens": 54559591.0,
      "reward": 0.36641088128089905,
      "reward_std": 0.05886705592274666,
      "rewards/gemini_judge_reward_func/mean": 0.0725446417927742,
      "rewards/gemini_judge_reward_func/std": 0.17885597050189972,
      "rewards/semantic_correctness_reward_func/mean": 0.400027334690094,
      "rewards/semantic_correctness_reward_func/std": 0.185530886054039,
      "rewards/xmlcount_reward_func/mean": 0.6434687376022339,
      "rewards/xmlcount_reward_func/std": 0.4783778190612793,
      "step": 153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 543.0,
      "completions/mean_length": 163.7857208251953,
      "completions/mean_terminated_length": 152.1085968017578,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.05257991549656438,
      "grad_norm": 0.02088047005236149,
      "kl": 0.013623714447021484,
      "learning_rate": 9.775303978015585e-06,
      "loss": 0.003,
      "num_tokens": 54911635.0,
      "reward": 0.47129741311073303,
      "reward_std": 0.0827810987830162,
      "rewards/gemini_judge_reward_func/mean": 0.1551339328289032,
      "rewards/gemini_judge_reward_func/std": 0.26509660482406616,
      "rewards/semantic_correctness_reward_func/mean": 0.46478161215782166,
      "rewards/semantic_correctness_reward_func/std": 0.22101202607154846,
      "rewards/xmlcount_reward_func/mean": 0.7907187342643738,
      "rewards/xmlcount_reward_func/std": 0.40539026260375977,
      "step": 154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 170.61607360839844,
      "completions/mean_terminated_length": 147.12843322753906,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.052921343519269345,
      "grad_norm": 0.021672114729881287,
      "kl": 0.016643762588500977,
      "learning_rate": 9.77037682640015e-06,
      "loss": -0.0197,
      "num_tokens": 55285241.0,
      "reward": 0.38316836953163147,
      "reward_std": 0.05640144646167755,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.2169451117515564,
      "rewards/semantic_correctness_reward_func/mean": 0.42352014780044556,
      "rewards/semantic_correctness_reward_func/std": 0.21736378967761993,
      "rewards/xmlcount_reward_func/mean": 0.6256250143051147,
      "rewards/xmlcount_reward_func/std": 0.48569241166114807,
      "step": 155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 305.0,
      "completions/mean_length": 158.07144165039062,
      "completions/mean_terminated_length": 154.1883544921875,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "epoch": 0.053262771541974306,
      "grad_norm": 0.021647725254297256,
      "kl": 0.011748313903808594,
      "learning_rate": 9.765397506708023e-06,
      "loss": 0.0024,
      "num_tokens": 55603469.0,
      "reward": 0.4995449483394623,
      "reward_std": 0.06327866017818451,
      "rewards/gemini_judge_reward_func/mean": 0.1227678582072258,
      "rewards/gemini_judge_reward_func/std": 0.22953340411186218,
      "rewards/semantic_correctness_reward_func/mean": 0.4289388656616211,
      "rewards/semantic_correctness_reward_func/std": 0.19935138523578644,
      "rewards/xmlcount_reward_func/mean": 0.9116250276565552,
      "rewards/xmlcount_reward_func/std": 0.28608015179634094,
      "step": 156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 740.0,
      "completions/mean_length": 167.4732208251953,
      "completions/mean_terminated_length": 155.84616088867188,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.053604199564679274,
      "grad_norm": 0.0213526152074337,
      "kl": 0.01268625259399414,
      "learning_rate": 9.760366073392246e-06,
      "loss": -0.0418,
      "num_tokens": 55956707.0,
      "reward": 0.44403091073036194,
      "reward_std": 0.07015395164489746,
      "rewards/gemini_judge_reward_func/mean": 0.1473214328289032,
      "rewards/gemini_judge_reward_func/std": 0.24616695940494537,
      "rewards/semantic_correctness_reward_func/mean": 0.42401161789894104,
      "rewards/semantic_correctness_reward_func/std": 0.20338886976242065,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 468.0,
      "completions/mean_length": 153.09375,
      "completions/mean_terminated_length": 149.1883544921875,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.053945627587384234,
      "grad_norm": 0.022866856306791306,
      "kl": 0.013154983520507812,
      "learning_rate": 9.755282581475769e-06,
      "loss": -0.0114,
      "num_tokens": 56308040.0,
      "reward": 0.4244222044944763,
      "reward_std": 0.06508694589138031,
      "rewards/gemini_judge_reward_func/mean": 0.1071428582072258,
      "rewards/gemini_judge_reward_func/std": 0.20562243461608887,
      "rewards/semantic_correctness_reward_func/mean": 0.44207531213760376,
      "rewards/semantic_correctness_reward_func/std": 0.21528586745262146,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 607.0,
      "completions/mean_length": 166.02232360839844,
      "completions/mean_terminated_length": 154.37557983398438,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.054287055610089195,
      "grad_norm": 0.020635830238461494,
      "kl": 0.014980077743530273,
      "learning_rate": 9.750147086550843e-06,
      "loss": -0.0214,
      "num_tokens": 56668957.0,
      "reward": 0.41711556911468506,
      "reward_std": 0.052652522921562195,
      "rewards/gemini_judge_reward_func/mean": 0.0970982164144516,
      "rewards/gemini_judge_reward_func/std": 0.21123822033405304,
      "rewards/semantic_correctness_reward_func/mean": 0.3898812234401703,
      "rewards/semantic_correctness_reward_func/std": 0.1925191879272461,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 936.0,
      "completions/mean_length": 169.67857360839844,
      "completions/mean_terminated_length": 154.14544677734375,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.05462848363279416,
      "grad_norm": 0.02253010869026184,
      "kl": 0.014844894409179688,
      "learning_rate": 9.744959644778422e-06,
      "loss": -0.0133,
      "num_tokens": 57031605.0,
      "reward": 0.40882542729377747,
      "reward_std": 0.07083141058683395,
      "rewards/gemini_judge_reward_func/mean": 0.1305803507566452,
      "rewards/gemini_judge_reward_func/std": 0.2459813803434372,
      "rewards/semantic_correctness_reward_func/mean": 0.4065912067890167,
      "rewards/semantic_correctness_reward_func/std": 0.18288636207580566,
      "rewards/xmlcount_reward_func/mean": 0.6881875395774841,
      "rewards/xmlcount_reward_func/std": 0.46501508355140686,
      "step": 160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 407.0,
      "completions/mean_length": 166.9553680419922,
      "completions/mean_terminated_length": 155.32127380371094,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.054969911655499124,
      "grad_norm": 0.021544380113482475,
      "kl": 0.013626575469970703,
      "learning_rate": 9.739720312887536e-06,
      "loss": -0.0282,
      "num_tokens": 57369867.0,
      "reward": 0.44754543900489807,
      "reward_std": 0.07036061584949493,
      "rewards/gemini_judge_reward_func/mean": 0.1194196417927742,
      "rewards/gemini_judge_reward_func/std": 0.24253910779953003,
      "rewards/semantic_correctness_reward_func/mean": 0.4258878231048584,
      "rewards/semantic_correctness_reward_func/std": 0.1886364221572876,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 684.0,
      "completions/mean_length": 164.01339721679688,
      "completions/mean_terminated_length": 144.3789825439453,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.05531133967820409,
      "grad_norm": 0.021095486357808113,
      "kl": 0.016514301300048828,
      "learning_rate": 9.734429148174676e-06,
      "loss": -0.0261,
      "num_tokens": 57756406.0,
      "reward": 0.35722407698631287,
      "reward_std": 0.07497614622116089,
      "rewards/gemini_judge_reward_func/mean": 0.1350446492433548,
      "rewards/gemini_judge_reward_func/std": 0.27490234375,
      "rewards/semantic_correctness_reward_func/mean": 0.40778082609176636,
      "rewards/semantic_correctness_reward_func/std": 0.1879914402961731,
      "rewards/xmlcount_reward_func/mean": 0.5541250109672546,
      "rewards/xmlcount_reward_func/std": 0.49873343110084534,
      "step": 162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 870.0,
      "completions/mean_length": 173.91519165039062,
      "completions/mean_terminated_length": 162.37557983398438,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "epoch": 0.05565276770090905,
      "grad_norm": 0.022490430623292923,
      "kl": 0.0135345458984375,
      "learning_rate": 9.729086208503174e-06,
      "loss": -0.0096,
      "num_tokens": 58128191.0,
      "reward": 0.4325447976589203,
      "reward_std": 0.05169191583991051,
      "rewards/gemini_judge_reward_func/mean": 0.0892857164144516,
      "rewards/gemini_judge_reward_func/std": 0.17189638316631317,
      "rewards/semantic_correctness_reward_func/mean": 0.446902334690094,
      "rewards/semantic_correctness_reward_func/std": 0.18784944713115692,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 851.0,
      "completions/mean_length": 164.34375,
      "completions/mean_terminated_length": 152.67420959472656,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.05599419572361401,
      "grad_norm": 0.0230224821716547,
      "kl": 0.013494491577148438,
      "learning_rate": 9.723691552302563e-06,
      "loss": -0.008,
      "num_tokens": 58458552.0,
      "reward": 0.484989196062088,
      "reward_std": 0.07875213027000427,
      "rewards/gemini_judge_reward_func/mean": 0.1506696492433548,
      "rewards/gemini_judge_reward_func/std": 0.2547961473464966,
      "rewards/semantic_correctness_reward_func/mean": 0.44335657358169556,
      "rewards/semantic_correctness_reward_func/std": 0.19373531639575958,
      "rewards/xmlcount_reward_func/mean": 0.8401250839233398,
      "rewards/xmlcount_reward_func/std": 0.3684578537940979,
      "step": 164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 323.0,
      "completions/mean_length": 150.0669708251953,
      "completions/mean_terminated_length": 138.20362854003906,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.05633562374631898,
      "grad_norm": 0.02238614670932293,
      "kl": 0.016913890838623047,
      "learning_rate": 9.718245238567939e-06,
      "loss": -0.0162,
      "num_tokens": 58824655.0,
      "reward": 0.38861343264579773,
      "reward_std": 0.06800764799118042,
      "rewards/gemini_judge_reward_func/mean": 0.1316964328289032,
      "rewards/gemini_judge_reward_func/std": 0.258184015750885,
      "rewards/semantic_correctness_reward_func/mean": 0.39789751172065735,
      "rewards/semantic_correctness_reward_func/std": 0.21606741845607758,
      "rewards/xmlcount_reward_func/mean": 0.6408883333206177,
      "rewards/xmlcount_reward_func/std": 0.480337917804718,
      "step": 165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 765.0,
      "completions/mean_length": 157.02232360839844,
      "completions/mean_terminated_length": 149.2117156982422,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.05667705176902394,
      "grad_norm": 0.021430594846606255,
      "kl": 0.01669931411743164,
      "learning_rate": 9.712747326859316e-06,
      "loss": -0.0294,
      "num_tokens": 59189008.0,
      "reward": 0.37962663173675537,
      "reward_std": 0.059822093695402145,
      "rewards/gemini_judge_reward_func/mean": 0.1060267835855484,
      "rewards/gemini_judge_reward_func/std": 0.2311926931142807,
      "rewards/semantic_correctness_reward_func/mean": 0.3990795314311981,
      "rewards/semantic_correctness_reward_func/std": 0.21964147686958313,
      "rewards/xmlcount_reward_func/mean": 0.643500030040741,
      "rewards/xmlcount_reward_func/std": 0.48071083426475525,
      "step": 166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 605.0,
      "completions/mean_length": 178.00894165039062,
      "completions/mean_terminated_length": 154.72476196289062,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.05701847979172891,
      "grad_norm": 0.021441694349050522,
      "kl": 0.013471364974975586,
      "learning_rate": 9.707197877300974e-06,
      "loss": 0.0005,
      "num_tokens": 59532526.0,
      "reward": 0.46267572045326233,
      "reward_std": 0.08093573898077011,
      "rewards/gemini_judge_reward_func/mean": 0.1383928507566452,
      "rewards/gemini_judge_reward_func/std": 0.26541972160339355,
      "rewards/semantic_correctness_reward_func/mean": 0.4269498884677887,
      "rewards/semantic_correctness_reward_func/std": 0.22219188511371613,
      "rewards/xmlcount_reward_func/mean": 0.8048214316368103,
      "rewards/xmlcount_reward_func/std": 0.39773449301719666,
      "step": 167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 921.0,
      "completions/mean_length": 153.24107360839844,
      "completions/mean_terminated_length": 145.39639282226562,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.05735990781443387,
      "grad_norm": 0.022561749443411827,
      "kl": 0.01429438591003418,
      "learning_rate": 9.701596950580807e-06,
      "loss": 0.0165,
      "num_tokens": 59871796.0,
      "reward": 0.4066465198993683,
      "reward_std": 0.06362809985876083,
      "rewards/gemini_judge_reward_func/mean": 0.1194196417927742,
      "rewards/gemini_judge_reward_func/std": 0.20902319252490997,
      "rewards/semantic_correctness_reward_func/mean": 0.4001430869102478,
      "rewards/semantic_correctness_reward_func/std": 0.19026781618595123,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 317.0,
      "completions/mean_length": 159.53125,
      "completions/mean_terminated_length": 155.6547088623047,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "epoch": 0.05770133583713883,
      "grad_norm": 0.022561749443411827,
      "kl": 0.013838768005371094,
      "learning_rate": 9.701596950580807e-06,
      "loss": -0.0069,
      "num_tokens": 60205287.0,
      "reward": 0.43320146203041077,
      "reward_std": 0.052795667201280594,
      "rewards/gemini_judge_reward_func/mean": 0.0859375,
      "rewards/gemini_judge_reward_func/std": 0.17600706219673157,
      "rewards/semantic_correctness_reward_func/mean": 0.38538211584091187,
      "rewards/semantic_correctness_reward_func/std": 0.2128061205148697,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 556.0,
      "completions/mean_length": 163.66964721679688,
      "completions/mean_terminated_length": 155.91893005371094,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.0580427638598438,
      "grad_norm": 0.022247465327382088,
      "kl": 0.011807918548583984,
      "learning_rate": 9.69594460794965e-06,
      "loss": -0.0284,
      "num_tokens": 60531329.0,
      "reward": 0.4758225083351135,
      "reward_std": 0.05295069143176079,
      "rewards/gemini_judge_reward_func/mean": 0.1383928507566452,
      "rewards/gemini_judge_reward_func/std": 0.23990674316883087,
      "rewards/semantic_correctness_reward_func/mean": 0.44310349225997925,
      "rewards/semantic_correctness_reward_func/std": 0.22307425737380981,
      "rewards/xmlcount_reward_func/mean": 0.8296116590499878,
      "rewards/xmlcount_reward_func/std": 0.3723537027835846,
      "step": 170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 472.0,
      "completions/mean_length": 157.7053680419922,
      "completions/mean_terminated_length": 145.94570922851562,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.05838419188254876,
      "grad_norm": 0.02209446392953396,
      "kl": 0.01704883575439453,
      "learning_rate": 9.690240911220618e-06,
      "loss": -0.0096,
      "num_tokens": 60891823.0,
      "reward": 0.4018482267856598,
      "reward_std": 0.06590615957975388,
      "rewards/gemini_judge_reward_func/mean": 0.1160714253783226,
      "rewards/gemini_judge_reward_func/std": 0.23540008068084717,
      "rewards/semantic_correctness_reward_func/mean": 0.4185981750488281,
      "rewards/semantic_correctness_reward_func/std": 0.20078440010547638,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 491.0,
      "completions/mean_length": 162.10269165039062,
      "completions/mean_terminated_length": 142.42465209960938,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.058725619905253726,
      "grad_norm": 0.023919543251395226,
      "kl": 0.018400192260742188,
      "learning_rate": 9.684485922768422e-06,
      "loss": -0.0181,
      "num_tokens": 61264334.0,
      "reward": 0.40431568026542664,
      "reward_std": 0.055412210524082184,
      "rewards/gemini_judge_reward_func/mean": 0.1383928507566452,
      "rewards/gemini_judge_reward_func/std": 0.2716815173625946,
      "rewards/semantic_correctness_reward_func/mean": 0.4220426380634308,
      "rewards/semantic_correctness_reward_func/std": 0.22644221782684326,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 781.0,
      "completions/mean_length": 164.0625,
      "completions/mean_terminated_length": 156.31532287597656,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.05906704792795869,
      "grad_norm": 0.021184051409363747,
      "kl": 0.013810157775878906,
      "learning_rate": 9.678679705528699e-06,
      "loss": -0.0254,
      "num_tokens": 61618052.0,
      "reward": 0.43104228377342224,
      "reward_std": 0.06282518804073334,
      "rewards/gemini_judge_reward_func/mean": 0.1116071417927742,
      "rewards/gemini_judge_reward_func/std": 0.19904279708862305,
      "rewards/semantic_correctness_reward_func/mean": 0.44841665029525757,
      "rewards/semantic_correctness_reward_func/std": 0.19527798891067505,
      "rewards/xmlcount_reward_func/mean": 0.741790235042572,
      "rewards/xmlcount_reward_func/std": 0.4394533038139343,
      "step": 173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 340.0,
      "completions/max_terminated_length": 340.0,
      "completions/mean_length": 141.54464721679688,
      "completions/mean_terminated_length": 141.54464721679688,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.05940847595066365,
      "grad_norm": 0.023413272574543953,
      "kl": 0.014312744140625,
      "learning_rate": 9.672822322997305e-06,
      "loss": 0.0039,
      "num_tokens": 61970298.0,
      "reward": 0.4327978491783142,
      "reward_std": 0.06107241287827492,
      "rewards/gemini_judge_reward_func/mean": 0.1350446492433548,
      "rewards/gemini_judge_reward_func/std": 0.26451072096824646,
      "rewards/semantic_correctness_reward_func/mean": 0.4281497001647949,
      "rewards/semantic_correctness_reward_func/std": 0.20419283211231232,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 921.0,
      "completions/mean_length": 160.125,
      "completions/mean_terminated_length": 156.25112915039062,
      "completions/min_length": 41.0,
      "completions/min_terminated_length": 41.0,
      "epoch": 0.059749903973368615,
      "grad_norm": 0.021777305752038956,
      "kl": 0.014169454574584961,
      "learning_rate": 9.666913839229639e-06,
      "loss": 0.0003,
      "num_tokens": 62331346.0,
      "reward": 0.4691229462623596,
      "reward_std": 0.07057663053274155,
      "rewards/gemini_judge_reward_func/mean": 0.1238839253783226,
      "rewards/gemini_judge_reward_func/std": 0.23075933754444122,
      "rewards/semantic_correctness_reward_func/mean": 0.45334672927856445,
      "rewards/semantic_correctness_reward_func/std": 0.21422763168811798,
      "rewards/xmlcount_reward_func/mean": 0.8222500681877136,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 974.0,
      "completions/mean_length": 155.46429443359375,
      "completions/mean_terminated_length": 151.56951904296875,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.060091331996073576,
      "grad_norm": 0.025058435276150703,
      "kl": 0.030515670776367188,
      "learning_rate": 9.660954318839934e-06,
      "loss": 0.0045,
      "num_tokens": 62678126.0,
      "reward": 0.46425560116767883,
      "reward_std": 0.057500917464494705,
      "rewards/gemini_judge_reward_func/mean": 0.1339285671710968,
      "rewards/gemini_judge_reward_func/std": 0.25483787059783936,
      "rewards/semantic_correctness_reward_func/mean": 0.44467073678970337,
      "rewards/semantic_correctness_reward_func/std": 0.22635166347026825,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 349.0,
      "completions/mean_length": 158.9419708251953,
      "completions/mean_terminated_length": 155.0627899169922,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.06043276001877854,
      "grad_norm": 0.02151457406580448,
      "kl": 0.014577388763427734,
      "learning_rate": 9.654943827000548e-06,
      "loss": -0.0264,
      "num_tokens": 63019985.0,
      "reward": 0.44926339387893677,
      "reward_std": 0.06779324263334274,
      "rewards/gemini_judge_reward_func/mean": 0.1127232164144516,
      "rewards/gemini_judge_reward_func/std": 0.22175651788711548,
      "rewards/semantic_correctness_reward_func/mean": 0.4478701949119568,
      "rewards/semantic_correctness_reward_func/std": 0.2044634222984314,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 164.93304443359375,
      "completions/mean_terminated_length": 157.19369506835938,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.060774188041483504,
      "grad_norm": 0.027191217988729477,
      "kl": 0.01347041130065918,
      "learning_rate": 9.648882429441258e-06,
      "loss": -0.0145,
      "num_tokens": 63344290.0,
      "reward": 0.46692782640457153,
      "reward_std": 0.06793376803398132,
      "rewards/gemini_judge_reward_func/mean": 0.1026785746216774,
      "rewards/gemini_judge_reward_func/std": 0.20789778232574463,
      "rewards/semantic_correctness_reward_func/mean": 0.4490319788455963,
      "rewards/semantic_correctness_reward_func/std": 0.21256029605865479,
      "rewards/xmlcount_reward_func/mean": 0.8401250243186951,
      "rewards/xmlcount_reward_func/std": 0.3684578537940979,
      "step": 178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 307.0,
      "completions/max_terminated_length": 307.0,
      "completions/mean_length": 148.0982208251953,
      "completions/mean_terminated_length": 148.0982208251953,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.06111561606418847,
      "grad_norm": 0.022637570276856422,
      "kl": 0.012826919555664062,
      "learning_rate": 9.642770192448537e-06,
      "loss": -0.0074,
      "num_tokens": 63716068.0,
      "reward": 0.45914244651794434,
      "reward_std": 0.0858091339468956,
      "rewards/gemini_judge_reward_func/mean": 0.1473214328289032,
      "rewards/gemini_judge_reward_func/std": 0.2583874762058258,
      "rewards/semantic_correctness_reward_func/mean": 0.4231850206851959,
      "rewards/semantic_correctness_reward_func/std": 0.21553608775138855,
      "rewards/xmlcount_reward_func/mean": 0.7889420390129089,
      "rewards/xmlcount_reward_func/std": 0.4050236642360687,
      "step": 179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 578.0,
      "completions/max_terminated_length": 578.0,
      "completions/mean_length": 153.35714721679688,
      "completions/mean_terminated_length": 153.35714721679688,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.06145704408689343,
      "grad_norm": 0.022738490253686905,
      "kl": 0.013818740844726562,
      "learning_rate": 9.636607182864828e-06,
      "loss": -0.0012,
      "num_tokens": 64067068.0,
      "reward": 0.47211530804634094,
      "reward_std": 0.05404976010322571,
      "rewards/gemini_judge_reward_func/mean": 0.1417410671710968,
      "rewards/gemini_judge_reward_func/std": 0.24774518609046936,
      "rewards/semantic_correctness_reward_func/mean": 0.45715656876564026,
      "rewards/semantic_correctness_reward_func/std": 0.2026260793209076,
      "rewards/xmlcount_reward_func/mean": 0.8099688291549683,
      "rewards/xmlcount_reward_func/std": 0.3902978301048279,
      "step": 180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 592.0,
      "completions/mean_length": 150.27232360839844,
      "completions/mean_terminated_length": 146.35426330566406,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.06179847210959839,
      "grad_norm": 0.024008719250559807,
      "kl": 0.01700735092163086,
      "learning_rate": 9.630393468087818e-06,
      "loss": 0.0122,
      "num_tokens": 64438281.0,
      "reward": 0.4074629247188568,
      "reward_std": 0.053221508860588074,
      "rewards/gemini_judge_reward_func/mean": 0.0959821417927742,
      "rewards/gemini_judge_reward_func/std": 0.19159916043281555,
      "rewards/semantic_correctness_reward_func/mean": 0.4153500497341156,
      "rewards/semantic_correctness_reward_func/std": 0.1790420562028885,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 413.0,
      "completions/max_terminated_length": 413.0,
      "completions/mean_length": 154.60714721679688,
      "completions/mean_terminated_length": 154.60714721679688,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.06213990013230336,
      "grad_norm": 0.022805478423833847,
      "kl": 0.012624263763427734,
      "learning_rate": 9.624129116069695e-06,
      "loss": -0.0175,
      "num_tokens": 64764149.0,
      "reward": 0.4789244532585144,
      "reward_std": 0.07191872596740723,
      "rewards/gemini_judge_reward_func/mean": 0.125,
      "rewards/gemini_judge_reward_func/std": 0.24718141555786133,
      "rewards/semantic_correctness_reward_func/mean": 0.4286222457885742,
      "rewards/semantic_correctness_reward_func/std": 0.22481898963451385,
      "rewards/xmlcount_reward_func/mean": 0.8580000996589661,
      "rewards/xmlcount_reward_func/std": 0.35106155276298523,
      "step": 182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 494.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 146.19644165039062,
      "completions/mean_terminated_length": 146.19644165039062,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.06248132815500832,
      "grad_norm": 0.02222471684217453,
      "kl": 0.014984130859375,
      "learning_rate": 9.61781419531641e-06,
      "loss": -0.0111,
      "num_tokens": 65122957.0,
      "reward": 0.4180612862110138,
      "reward_std": 0.056327447295188904,
      "rewards/gemini_judge_reward_func/mean": 0.1049107164144516,
      "rewards/gemini_judge_reward_func/std": 0.20677535235881805,
      "rewards/semantic_correctness_reward_func/mean": 0.39460986852645874,
      "rewards/semantic_correctness_reward_func/std": 0.1847928911447525,
      "rewards/xmlcount_reward_func/mean": 0.7429375648498535,
      "rewards/xmlcount_reward_func/std": 0.43272262811660767,
      "step": 183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 531.0,
      "completions/max_terminated_length": 531.0,
      "completions/mean_length": 142.21875,
      "completions/mean_terminated_length": 142.21875,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.06282275617771328,
      "grad_norm": 0.02309420332312584,
      "kl": 0.017095088958740234,
      "learning_rate": 9.611448774886925e-06,
      "loss": 0.0017,
      "num_tokens": 65481062.0,
      "reward": 0.40878623723983765,
      "reward_std": 0.05744129791855812,
      "rewards/gemini_judge_reward_func/mean": 0.1071428582072258,
      "rewards/gemini_judge_reward_func/std": 0.24194253981113434,
      "rewards/semantic_correctness_reward_func/mean": 0.39964547753334045,
      "rewards/semantic_correctness_reward_func/std": 0.1898690164089203,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 677.0,
      "completions/mean_length": 163.8482208251953,
      "completions/mean_terminated_length": 156.09910583496094,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.06316418420041825,
      "grad_norm": 0.02297145500779152,
      "kl": 0.012150287628173828,
      "learning_rate": 9.605032924392457e-06,
      "loss": -0.0288,
      "num_tokens": 65832704.0,
      "reward": 0.43201377987861633,
      "reward_std": 0.06907930970191956,
      "rewards/gemini_judge_reward_func/mean": 0.1138392835855484,
      "rewards/gemini_judge_reward_func/std": 0.2304907888174057,
      "rewards/semantic_correctness_reward_func/mean": 0.4308900535106659,
      "rewards/semantic_correctness_reward_func/std": 0.2030285894870758,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 359.0,
      "completions/max_terminated_length": 359.0,
      "completions/mean_length": 150.8928680419922,
      "completions/mean_terminated_length": 150.8928680419922,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.06350561222312322,
      "grad_norm": 0.02199730835855007,
      "kl": 0.012223243713378906,
      "learning_rate": 9.598566713995718e-06,
      "loss": -0.006,
      "num_tokens": 66182992.0,
      "reward": 0.46267858147621155,
      "reward_std": 0.055922579020261765,
      "rewards/gemini_judge_reward_func/mean": 0.1395089328289032,
      "rewards/gemini_judge_reward_func/std": 0.23512086272239685,
      "rewards/semantic_correctness_reward_func/mean": 0.46137502789497375,
      "rewards/semantic_correctness_reward_func/std": 0.20330458879470825,
      "rewards/xmlcount_reward_func/mean": 0.786500096321106,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 627.0,
      "completions/mean_length": 161.57144165039062,
      "completions/mean_terminated_length": 149.8642578125,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.06384704024582817,
      "grad_norm": 0.023103708401322365,
      "kl": 0.014789819717407227,
      "learning_rate": 9.592050214410152e-06,
      "loss": -0.0102,
      "num_tokens": 66515820.0,
      "reward": 0.43203839659690857,
      "reward_std": 0.06265180557966232,
      "rewards/gemini_judge_reward_func/mean": 0.1183035746216774,
      "rewards/gemini_judge_reward_func/std": 0.22575190663337708,
      "rewards/semantic_correctness_reward_func/mean": 0.4220845401287079,
      "rewards/semantic_correctness_reward_func/std": 0.2233872264623642,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 259.0,
      "completions/mean_length": 155.8303680419922,
      "completions/mean_terminated_length": 140.0454559326172,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.06418846826853314,
      "grad_norm": 0.02313879318535328,
      "kl": 0.014795541763305664,
      "learning_rate": 9.585483496899151e-06,
      "loss": -0.0222,
      "num_tokens": 66879166.0,
      "reward": 0.44597339630126953,
      "reward_std": 0.07401511073112488,
      "rewards/gemini_judge_reward_func/mean": 0.1640625,
      "rewards/gemini_judge_reward_func/std": 0.3006584942340851,
      "rewards/semantic_correctness_reward_func/mean": 0.42083999514579773,
      "rewards/semantic_correctness_reward_func/std": 0.2109062224626541,
      "rewards/xmlcount_reward_func/mean": 0.740450918674469,
      "rewards/xmlcount_reward_func/std": 0.439119815826416,
      "step": 188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 768.0,
      "completions/max_terminated_length": 768.0,
      "completions/mean_length": 146.3482208251953,
      "completions/mean_terminated_length": 146.3482208251953,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.0645298962912381,
      "grad_norm": 0.022025736048817635,
      "kl": 0.014230012893676758,
      "learning_rate": 9.578866633275289e-06,
      "loss": -0.0026,
      "num_tokens": 67217452.0,
      "reward": 0.4548703730106354,
      "reward_std": 0.05710577219724655,
      "rewards/gemini_judge_reward_func/mean": 0.1004464253783226,
      "rewards/gemini_judge_reward_func/std": 0.22450698912143707,
      "rewards/semantic_correctness_reward_func/mean": 0.39320898056030273,
      "rewards/semantic_correctness_reward_func/std": 0.22224441170692444,
      "rewards/xmlcount_reward_func/mean": 0.8401250243186951,
      "rewards/xmlcount_reward_func/std": 0.3684578835964203,
      "step": 189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 310.0,
      "completions/mean_length": 159.1116180419922,
      "completions/mean_terminated_length": 147.3710479736328,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.06487132431394306,
      "grad_norm": 0.026655780151486397,
      "kl": 0.013226747512817383,
      "learning_rate": 9.572199695899522e-06,
      "loss": 0.0077,
      "num_tokens": 67551205.0,
      "reward": 0.41442981362342834,
      "reward_std": 0.05420489236712456,
      "rewards/gemini_judge_reward_func/mean": 0.0859375,
      "rewards/gemini_judge_reward_func/std": 0.19272884726524353,
      "rewards/semantic_correctness_reward_func/mean": 0.4345238506793976,
      "rewards/semantic_correctness_reward_func/std": 0.1913345605134964,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 706.0,
      "completions/mean_length": 156.5803680419922,
      "completions/mean_terminated_length": 144.80543518066406,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.06521275233664803,
      "grad_norm": 0.022954002022743225,
      "kl": 0.013891220092773438,
      "learning_rate": 9.565482757680415e-06,
      "loss": -0.0215,
      "num_tokens": 67917511.0,
      "reward": 0.4133635461330414,
      "reward_std": 0.07708757370710373,
      "rewards/gemini_judge_reward_func/mean": 0.1160714253783226,
      "rewards/gemini_judge_reward_func/std": 0.2281446009874344,
      "rewards/semantic_correctness_reward_func/mean": 0.44113001227378845,
      "rewards/semantic_correctness_reward_func/std": 0.18605509400367737,
      "rewards/xmlcount_reward_func/mean": 0.6967723965644836,
      "rewards/xmlcount_reward_func/std": 0.5375342965126038,
      "step": 191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 440.0,
      "completions/mean_length": 152.76339721679688,
      "completions/mean_terminated_length": 140.93666076660156,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.065554180359353,
      "grad_norm": 0.02197893150150776,
      "kl": 0.013241052627563477,
      "learning_rate": 9.558715892073324e-06,
      "loss": -0.0152,
      "num_tokens": 68291746.0,
      "reward": 0.41031619906425476,
      "reward_std": 0.0750560611486435,
      "rewards/gemini_judge_reward_func/mean": 0.1361607164144516,
      "rewards/gemini_judge_reward_func/std": 0.26340386271476746,
      "rewards/semantic_correctness_reward_func/mean": 0.43535763025283813,
      "rewards/semantic_correctness_reward_func/std": 0.2265578806400299,
      "rewards/xmlcount_reward_func/mean": 0.6719509363174438,
      "rewards/xmlcount_reward_func/std": 0.4701566696166992,
      "step": 192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 907.0,
      "completions/mean_length": 155.22769165039062,
      "completions/mean_terminated_length": 143.4344024658203,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.06589560838205796,
      "grad_norm": 0.02204836718738079,
      "kl": 0.015254497528076172,
      "learning_rate": 9.551899173079607e-06,
      "loss": 0.0193,
      "num_tokens": 68675557.0,
      "reward": 0.4088119864463806,
      "reward_std": 0.06842758506536484,
      "rewards/gemini_judge_reward_func/mean": 0.1462053507566452,
      "rewards/gemini_judge_reward_func/std": 0.28724196553230286,
      "rewards/semantic_correctness_reward_func/mean": 0.46464914083480835,
      "rewards/semantic_correctness_reward_func/std": 0.21852509677410126,
      "rewards/xmlcount_reward_func/mean": 0.643500030040741,
      "rewards/xmlcount_reward_func/std": 0.48071083426475525,
      "step": 193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 528.0,
      "completions/mean_length": 164.80804443359375,
      "completions/mean_terminated_length": 153.14480590820312,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.06623703640476292,
      "grad_norm": 0.021376753225922585,
      "kl": 0.011332511901855469,
      "learning_rate": 9.545032675245814e-06,
      "loss": -0.0086,
      "num_tokens": 69038770.0,
      "reward": 0.47529637813568115,
      "reward_std": 0.0824359580874443,
      "rewards/gemini_judge_reward_func/mean": 0.1506696492433548,
      "rewards/gemini_judge_reward_func/std": 0.26871559023857117,
      "rewards/semantic_correctness_reward_func/mean": 0.4306424558162689,
      "rewards/semantic_correctness_reward_func/std": 0.21526159346103668,
      "rewards/xmlcount_reward_func/mean": 0.8222500681877136,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 900.0,
      "completions/mean_length": 169.97769165039062,
      "completions/mean_terminated_length": 154.4499969482422,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.06657846442746788,
      "grad_norm": 0.021403852850198746,
      "kl": 0.012527227401733398,
      "learning_rate": 9.538116473662862e-06,
      "loss": -0.0267,
      "num_tokens": 69391789.0,
      "reward": 0.3966446816921234,
      "reward_std": 0.046989116817712784,
      "rewards/gemini_judge_reward_func/mean": 0.0970982164144516,
      "rewards/gemini_judge_reward_func/std": 0.21387535333633423,
      "rewards/semantic_correctness_reward_func/mean": 0.3947768807411194,
      "rewards/semantic_correctness_reward_func/std": 0.1843794733285904,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 739.0,
      "completions/mean_length": 151.91519165039062,
      "completions/mean_terminated_length": 148.00448608398438,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.06691989245017285,
      "grad_norm": 0.022233830764889717,
      "kl": 0.012385845184326172,
      "learning_rate": 9.531150643965224e-06,
      "loss": -0.0391,
      "num_tokens": 69735454.0,
      "reward": 0.45047202706336975,
      "reward_std": 0.0631929263472557,
      "rewards/gemini_judge_reward_func/mean": 0.1350446492433548,
      "rewards/gemini_judge_reward_func/std": 0.21152234077453613,
      "rewards/semantic_correctness_reward_func/mean": 0.42714568972587585,
      "rewards/semantic_correctness_reward_func/std": 0.1992858350276947,
      "rewards/xmlcount_reward_func/mean": 0.7775625586509705,
      "rewards/xmlcount_reward_func/std": 0.4177508056163788,
      "step": 196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 724.0,
      "completions/mean_length": 163.46429443359375,
      "completions/mean_terminated_length": 147.81817626953125,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.0672613204728778,
      "grad_norm": 0.02109030820429325,
      "kl": 0.013692617416381836,
      "learning_rate": 9.524135262330098e-06,
      "loss": -0.0261,
      "num_tokens": 70092946.0,
      "reward": 0.4639114439487457,
      "reward_std": 0.08710245043039322,
      "rewards/gemini_judge_reward_func/mean": 0.1651785671710968,
      "rewards/gemini_judge_reward_func/std": 0.2721233069896698,
      "rewards/semantic_correctness_reward_func/mean": 0.4921642243862152,
      "rewards/semantic_correctness_reward_func/std": 0.19522406160831451,
      "rewards/xmlcount_reward_func/mean": 0.7485179305076599,
      "rewards/xmlcount_reward_func/std": 0.4344094395637512,
      "step": 197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 529.0,
      "completions/mean_length": 155.98214721679688,
      "completions/mean_terminated_length": 140.1999969482422,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.06760274849558277,
      "grad_norm": 0.02615966647863388,
      "kl": 0.017928600311279297,
      "learning_rate": 9.517070405476575e-06,
      "loss": -0.0153,
      "num_tokens": 70460834.0,
      "reward": 0.41672831773757935,
      "reward_std": 0.06112901121377945,
      "rewards/gemini_judge_reward_func/mean": 0.1417410671710968,
      "rewards/gemini_judge_reward_func/std": 0.23377598822116852,
      "rewards/semantic_correctness_reward_func/mean": 0.44165927171707153,
      "rewards/semantic_correctness_reward_func/std": 0.20446263253688812,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 564.0,
      "completions/mean_length": 166.02232360839844,
      "completions/mean_terminated_length": 154.37557983398438,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.06794417651828774,
      "grad_norm": 0.021424876525998116,
      "kl": 0.014705181121826172,
      "learning_rate": 9.509956150664796e-06,
      "loss": -0.002,
      "num_tokens": 70813555.0,
      "reward": 0.4322836995124817,
      "reward_std": 0.06650731712579727,
      "rewards/gemini_judge_reward_func/mean": 0.1116071417927742,
      "rewards/gemini_judge_reward_func/std": 0.22669215500354767,
      "rewards/semantic_correctness_reward_func/mean": 0.4367040693759918,
      "rewards/semantic_correctness_reward_func/std": 0.20320431888103485,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 819.0,
      "completions/mean_length": 173.4553680419922,
      "completions/mean_terminated_length": 161.90951538085938,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.06828560454099271,
      "grad_norm": 0.02211941033601761,
      "kl": 0.014380931854248047,
      "learning_rate": 9.502792575695112e-06,
      "loss": -0.0222,
      "num_tokens": 71159069.0,
      "reward": 0.4176250398159027,
      "reward_std": 0.05642692372202873,
      "rewards/gemini_judge_reward_func/mean": 0.1160714253783226,
      "rewards/gemini_judge_reward_func/std": 0.2219172716140747,
      "rewards/semantic_correctness_reward_func/mean": 0.425982266664505,
      "rewards/semantic_correctness_reward_func/std": 0.18948838114738464,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 585.0,
      "completions/max_terminated_length": 585.0,
      "completions/mean_length": 158.24554443359375,
      "completions/mean_terminated_length": 158.24554443359375,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.06862703256369766,
      "grad_norm": 0.021084995940327644,
      "kl": 0.012256860733032227,
      "learning_rate": 9.495579758907231e-06,
      "loss": -0.0282,
      "num_tokens": 71504732.0,
      "reward": 0.47500061988830566,
      "reward_std": 0.06313009560108185,
      "rewards/gemini_judge_reward_func/mean": 0.1328125,
      "rewards/gemini_judge_reward_func/std": 0.22941070795059204,
      "rewards/semantic_correctness_reward_func/mean": 0.4291277825832367,
      "rewards/semantic_correctness_reward_func/std": 0.20796488225460052,
      "rewards/xmlcount_reward_func/mean": 0.8401250839233398,
      "rewards/xmlcount_reward_func/std": 0.3684578537940979,
      "step": 201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 335.0,
      "completions/mean_length": 164.35714721679688,
      "completions/mean_terminated_length": 152.6877899169922,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.06896846058640263,
      "grad_norm": 0.02392446994781494,
      "kl": 0.014485836029052734,
      "learning_rate": 9.48831777917936e-06,
      "loss": -0.0129,
      "num_tokens": 71864228.0,
      "reward": 0.458452969789505,
      "reward_std": 0.0712665319442749,
      "rewards/gemini_judge_reward_func/mean": 0.1238839253783226,
      "rewards/gemini_judge_reward_func/std": 0.23075933754444122,
      "rewards/semantic_correctness_reward_func/mean": 0.44468429684638977,
      "rewards/semantic_correctness_reward_func/std": 0.21000932157039642,
      "rewards/xmlcount_reward_func/mean": 0.799906313419342,
      "rewards/xmlcount_reward_func/std": 0.40196701884269714,
      "step": 202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 155.2232208251953,
      "completions/mean_terminated_length": 151.32736206054688,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.0693098886091076,
      "grad_norm": 0.021716050803661346,
      "kl": 0.014530658721923828,
      "learning_rate": 9.481006715927352e-06,
      "loss": -0.0081,
      "num_tokens": 72209762.0,
      "reward": 0.43672648072242737,
      "reward_std": 0.051114633679389954,
      "rewards/gemini_judge_reward_func/mean": 0.1104910746216774,
      "rewards/gemini_judge_reward_func/std": 0.24901461601257324,
      "rewards/semantic_correctness_reward_func/mean": 0.42540010809898376,
      "rewards/semantic_correctness_reward_func/std": 0.20755036175251007,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 164.58482360839844,
      "completions/mean_terminated_length": 148.95909118652344,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.06965131663181255,
      "grad_norm": 0.023567847907543182,
      "kl": 0.0158233642578125,
      "learning_rate": 9.473646649103819e-06,
      "loss": -0.0073,
      "num_tokens": 72559321.0,
      "reward": 0.3971547782421112,
      "reward_std": 0.06843477487564087,
      "rewards/gemini_judge_reward_func/mean": 0.0848214253783226,
      "rewards/gemini_judge_reward_func/std": 0.19823653995990753,
      "rewards/semantic_correctness_reward_func/mean": 0.37718465924263,
      "rewards/semantic_correctness_reward_func/std": 0.20006071031093597,
      "rewards/xmlcount_reward_func/mean": 0.7194733023643494,
      "rewards/xmlcount_reward_func/std": 0.44856736063957214,
      "step": 204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 933.0,
      "completions/mean_length": 165.51339721679688,
      "completions/mean_terminated_length": 157.77928161621094,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.06999274465451752,
      "grad_norm": 0.02119120955467224,
      "kl": 0.014370441436767578,
      "learning_rate": 9.466237659197271e-06,
      "loss": 0.0142,
      "num_tokens": 72915740.0,
      "reward": 0.4165930151939392,
      "reward_std": 0.05669743940234184,
      "rewards/gemini_judge_reward_func/mean": 0.1071428582072258,
      "rewards/gemini_judge_reward_func/std": 0.22389310598373413,
      "rewards/semantic_correctness_reward_func/mean": 0.40292924642562866,
      "rewards/semantic_correctness_reward_func/std": 0.19625583291053772,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 161.34375,
      "completions/mean_terminated_length": 145.65908813476562,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.07033417267722249,
      "grad_norm": 0.02108367159962654,
      "kl": 0.017071247100830078,
      "learning_rate": 9.458779827231237e-06,
      "loss": -0.0316,
      "num_tokens": 73288101.0,
      "reward": 0.4331299364566803,
      "reward_std": 0.06814208626747131,
      "rewards/gemini_judge_reward_func/mean": 0.1607142835855484,
      "rewards/gemini_judge_reward_func/std": 0.28579434752464294,
      "rewards/semantic_correctness_reward_func/mean": 0.44997096061706543,
      "rewards/semantic_correctness_reward_func/std": 0.19737227261066437,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 496.0,
      "completions/mean_length": 172.92857360839844,
      "completions/mean_terminated_length": 149.50457763671875,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.07067560069992744,
      "grad_norm": 0.023088613525032997,
      "kl": 0.015703678131103516,
      "learning_rate": 9.451273234763372e-06,
      "loss": 0.0331,
      "num_tokens": 73661089.0,
      "reward": 0.4168475568294525,
      "reward_std": 0.05705378204584122,
      "rewards/gemini_judge_reward_func/mean": 0.1350446492433548,
      "rewards/gemini_judge_reward_func/std": 0.27490234375,
      "rewards/semantic_correctness_reward_func/mean": 0.4198981821537018,
      "rewards/semantic_correctness_reward_func/std": 0.19315725564956665,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 161.10714721679688,
      "completions/mean_terminated_length": 157.2376708984375,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.07101702872263241,
      "grad_norm": 0.022637007758021355,
      "kl": 0.01239013671875,
      "learning_rate": 9.443717963884568e-06,
      "loss": -0.0349,
      "num_tokens": 74021557.0,
      "reward": 0.4410480856895447,
      "reward_std": 0.06117826700210571,
      "rewards/gemini_judge_reward_func/mean": 0.1439732164144516,
      "rewards/gemini_judge_reward_func/std": 0.2487129420042038,
      "rewards/semantic_correctness_reward_func/mean": 0.41579389572143555,
      "rewards/semantic_correctness_reward_func/std": 0.1910022795200348,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 163.0982208251953,
      "completions/mean_terminated_length": 155.34234619140625,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.07135845674533738,
      "grad_norm": 0.021986160427331924,
      "kl": 0.01541757583618164,
      "learning_rate": 9.43611409721806e-06,
      "loss": -0.0005,
      "num_tokens": 74362631.0,
      "reward": 0.4239736497402191,
      "reward_std": 0.053107425570487976,
      "rewards/gemini_judge_reward_func/mean": 0.1149553582072258,
      "rewards/gemini_judge_reward_func/std": 0.249235600233078,
      "rewards/semantic_correctness_reward_func/mean": 0.38845744729042053,
      "rewards/semantic_correctness_reward_func/std": 0.2081860601902008,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 844.0,
      "completions/mean_length": 182.16964721679688,
      "completions/mean_terminated_length": 159.0,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.07169988476804234,
      "grad_norm": 0.020678477361798286,
      "kl": 0.01442861557006836,
      "learning_rate": 9.428461717918512e-06,
      "loss": -0.0257,
      "num_tokens": 74725897.0,
      "reward": 0.4402584433555603,
      "reward_std": 0.06913614273071289,
      "rewards/gemini_judge_reward_func/mean": 0.140625,
      "rewards/gemini_judge_reward_func/std": 0.24093718826770782,
      "rewards/semantic_correctness_reward_func/mean": 0.45429208874702454,
      "rewards/semantic_correctness_reward_func/std": 0.22150787711143494,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 459.0,
      "completions/mean_length": 156.3125,
      "completions/mean_terminated_length": 144.533935546875,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.0720413127907473,
      "grad_norm": 0.02135198749601841,
      "kl": 0.014465570449829102,
      "learning_rate": 9.420760909671119e-06,
      "loss": -0.0045,
      "num_tokens": 75080187.0,
      "reward": 0.3945431113243103,
      "reward_std": 0.060559362173080444,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.26146823167800903,
      "rewards/semantic_correctness_reward_func/mean": 0.408893883228302,
      "rewards/semantic_correctness_reward_func/std": 0.1875745952129364,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 289.0,
      "completions/mean_length": 164.40179443359375,
      "completions/mean_terminated_length": 144.7762451171875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.07238274081345226,
      "grad_norm": 0.02135198749601841,
      "kl": 0.01694631576538086,
      "learning_rate": 9.420760909671119e-06,
      "loss": -0.0253,
      "num_tokens": 75444185.0,
      "reward": 0.4022166430950165,
      "reward_std": 0.05362841486930847,
      "rewards/gemini_judge_reward_func/mean": 0.1361607164144516,
      "rewards/gemini_judge_reward_func/std": 0.23767462372779846,
      "rewards/semantic_correctness_reward_func/mean": 0.4517615735530853,
      "rewards/semantic_correctness_reward_func/std": 0.20318275690078735,
      "rewards/xmlcount_reward_func/mean": 0.643500030040741,
      "rewards/xmlcount_reward_func/std": 0.48071083426475525,
      "step": 212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 815.0,
      "completions/mean_length": 159.13839721679688,
      "completions/mean_terminated_length": 147.398193359375,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.07272416883615723,
      "grad_norm": 0.021344488486647606,
      "kl": 0.015068531036376953,
      "learning_rate": 9.413011756690686e-06,
      "loss": -0.0179,
      "num_tokens": 75804152.0,
      "reward": 0.4192578196525574,
      "reward_std": 0.056778065860271454,
      "rewards/gemini_judge_reward_func/mean": 0.1037946417927742,
      "rewards/gemini_judge_reward_func/std": 0.22484947741031647,
      "rewards/semantic_correctness_reward_func/mean": 0.42294973134994507,
      "rewards/semantic_correctness_reward_func/std": 0.22244691848754883,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 328.0,
      "completions/max_terminated_length": 328.0,
      "completions/mean_length": 144.6741180419922,
      "completions/mean_terminated_length": 144.6741180419922,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.07306559685886219,
      "grad_norm": 0.02157749980688095,
      "kl": 0.014754772186279297,
      "learning_rate": 9.405214343720708e-06,
      "loss": -0.0117,
      "num_tokens": 76156359.0,
      "reward": 0.42160260677337646,
      "reward_std": 0.054873332381248474,
      "rewards/gemini_judge_reward_func/mean": 0.0848214253783226,
      "rewards/gemini_judge_reward_func/std": 0.19249825179576874,
      "rewards/semantic_correctness_reward_func/mean": 0.4081380069255829,
      "rewards/semantic_correctness_reward_func/std": 0.1861819475889206,
      "rewards/xmlcount_reward_func/mean": 0.7651161551475525,
      "rewards/xmlcount_reward_func/std": 0.4228045344352722,
      "step": 214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 336.0,
      "completions/mean_length": 170.70089721679688,
      "completions/mean_terminated_length": 155.1863555908203,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.07340702488156715,
      "grad_norm": 0.02151479572057724,
      "kl": 0.014784812927246094,
      "learning_rate": 9.397368756032445e-06,
      "loss": -0.0108,
      "num_tokens": 76514364.0,
      "reward": 0.42958998680114746,
      "reward_std": 0.058609623461961746,
      "rewards/gemini_judge_reward_func/mean": 0.1160714253783226,
      "rewards/gemini_judge_reward_func/std": 0.2244289368391037,
      "rewards/semantic_correctness_reward_func/mean": 0.41548532247543335,
      "rewards/semantic_correctness_reward_func/std": 0.2050415575504303,
      "rewards/xmlcount_reward_func/mean": 0.7501607537269592,
      "rewards/xmlcount_reward_func/std": 0.4322551190853119,
      "step": 215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 730.0,
      "completions/mean_length": 160.38394165039062,
      "completions/mean_terminated_length": 156.51121520996094,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.07374845290427212,
      "grad_norm": 0.02160024829208851,
      "kl": 0.015204429626464844,
      "learning_rate": 9.389475079423988e-06,
      "loss": 0.0052,
      "num_tokens": 76871318.0,
      "reward": 0.40496453642845154,
      "reward_std": 0.06893553584814072,
      "rewards/gemini_judge_reward_func/mean": 0.1339285671710968,
      "rewards/gemini_judge_reward_func/std": 0.25151684880256653,
      "rewards/semantic_correctness_reward_func/mean": 0.4342152774333954,
      "rewards/semantic_correctness_reward_func/std": 0.20986708998680115,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 635.0,
      "completions/mean_length": 152.30804443359375,
      "completions/mean_terminated_length": 148.39910888671875,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.07408988092697708,
      "grad_norm": 0.02134779281914234,
      "kl": 0.01924419403076172,
      "learning_rate": 9.381533400219319e-06,
      "loss": 0.0234,
      "num_tokens": 77235119.0,
      "reward": 0.47121262550354004,
      "reward_std": 0.07921778410673141,
      "rewards/gemini_judge_reward_func/mean": 0.1674107164144516,
      "rewards/gemini_judge_reward_func/std": 0.27890798449516296,
      "rewards/semantic_correctness_reward_func/mean": 0.4839913547039032,
      "rewards/semantic_correctness_reward_func/std": 0.22796045243740082,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 318.0,
      "completions/mean_length": 156.70089721679688,
      "completions/mean_terminated_length": 152.81166076660156,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.07443130894968204,
      "grad_norm": 0.021481551229953766,
      "kl": 0.013187408447265625,
      "learning_rate": 9.373543805267367e-06,
      "loss": -0.0126,
      "num_tokens": 77561024.0,
      "reward": 0.4687114655971527,
      "reward_std": 0.06550662219524384,
      "rewards/gemini_judge_reward_func/mean": 0.1227678582072258,
      "rewards/gemini_judge_reward_func/std": 0.21306942403316498,
      "rewards/semantic_correctness_reward_func/mean": 0.4535214602947235,
      "rewards/semantic_correctness_reward_func/std": 0.21218258142471313,
      "rewards/xmlcount_reward_func/mean": 0.8222500681877136,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 324.0,
      "completions/mean_length": 170.36607360839844,
      "completions/mean_terminated_length": 150.876708984375,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.07477273697238701,
      "grad_norm": 0.020490756258368492,
      "kl": 0.013939857482910156,
      "learning_rate": 9.365506381941066e-06,
      "loss": 0.0014,
      "num_tokens": 77917414.0,
      "reward": 0.4274297058582306,
      "reward_std": 0.06760236620903015,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.2560526132583618,
      "rewards/semantic_correctness_reward_func/mean": 0.44821077585220337,
      "rewards/semantic_correctness_reward_func/std": 0.21363292634487152,
      "rewards/xmlcount_reward_func/mean": 0.7239331007003784,
      "rewards/xmlcount_reward_func/std": 0.4488573372364044,
      "step": 219
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 453.0,
      "completions/mean_length": 163.45982360839844,
      "completions/mean_terminated_length": 147.81362915039062,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.07511416499509198,
      "grad_norm": 0.021693740040063858,
      "kl": 0.016297340393066406,
      "learning_rate": 9.357421218136387e-06,
      "loss": -0.023,
      "num_tokens": 78286709.0,
      "reward": 0.4039275050163269,
      "reward_std": 0.06309369206428528,
      "rewards/gemini_judge_reward_func/mean": 0.1026785746216774,
      "rewards/gemini_judge_reward_func/std": 0.22474093735218048,
      "rewards/semantic_correctness_reward_func/mean": 0.411083847284317,
      "rewards/semantic_correctness_reward_func/std": 0.20537878572940826,
      "rewards/xmlcount_reward_func/mean": 0.7015982270240784,
      "rewards/xmlcount_reward_func/std": 0.4568972587585449,
      "step": 220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 329.0,
      "completions/mean_length": 161.10269165039062,
      "completions/mean_terminated_length": 149.38914489746094,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.07545559301779693,
      "grad_norm": 0.021494340151548386,
      "kl": 0.014636516571044922,
      "learning_rate": 9.349288402271387e-06,
      "loss": 0.0103,
      "num_tokens": 78654304.0,
      "reward": 0.40833523869514465,
      "reward_std": 0.06033541262149811,
      "rewards/gemini_judge_reward_func/mean": 0.1194196417927742,
      "rewards/gemini_judge_reward_func/std": 0.24483934044837952,
      "rewards/semantic_correctness_reward_func/mean": 0.4185151755809784,
      "rewards/semantic_correctness_reward_func/std": 0.18770398199558258,
      "rewards/xmlcount_reward_func/mean": 0.6921607255935669,
      "rewards/xmlcount_reward_func/std": 0.46012961864471436,
      "step": 221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 477.0,
      "completions/mean_length": 163.02679443359375,
      "completions/mean_terminated_length": 155.27027893066406,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.0757970210405019,
      "grad_norm": 0.021399740129709244,
      "kl": 0.015057563781738281,
      "learning_rate": 9.341108023285239e-06,
      "loss": -0.0114,
      "num_tokens": 79000814.0,
      "reward": 0.45225730538368225,
      "reward_std": 0.07958463579416275,
      "rewards/gemini_judge_reward_func/mean": 0.1506696492433548,
      "rewards/gemini_judge_reward_func/std": 0.25258663296699524,
      "rewards/semantic_correctness_reward_func/mean": 0.43167024850845337,
      "rewards/semantic_correctness_reward_func/std": 0.20277100801467896,
      "rewards/xmlcount_reward_func/mean": 0.764138400554657,
      "rewards/xmlcount_reward_func/std": 0.42636701464653015,
      "step": 222
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 556.0,
      "completions/mean_length": 152.04019165039062,
      "completions/mean_terminated_length": 148.1300506591797,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.07613844906320687,
      "grad_norm": 0.023108750581741333,
      "kl": 0.013528823852539062,
      "learning_rate": 9.332880170637252e-06,
      "loss": -0.019,
      "num_tokens": 79331523.0,
      "reward": 0.412973016500473,
      "reward_std": 0.04944463074207306,
      "rewards/gemini_judge_reward_func/mean": 0.0948660746216774,
      "rewards/gemini_judge_reward_func/std": 0.18995627760887146,
      "rewards/semantic_correctness_reward_func/mean": 0.4093826711177826,
      "rewards/semantic_correctness_reward_func/std": 0.19295699894428253,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 392.0,
      "completions/mean_length": 162.71429443359375,
      "completions/mean_terminated_length": 147.05453491210938,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.07647987708591182,
      "grad_norm": 0.0203098151832819,
      "kl": 0.015305519104003906,
      "learning_rate": 9.324604934305911e-06,
      "loss": 0.0189,
      "num_tokens": 79697987.0,
      "reward": 0.40999144315719604,
      "reward_std": 0.05599633976817131,
      "rewards/gemini_judge_reward_func/mean": 0.0870535746216774,
      "rewards/gemini_judge_reward_func/std": 0.18250198662281036,
      "rewards/semantic_correctness_reward_func/mean": 0.4190196692943573,
      "rewards/semantic_correctness_reward_func/std": 0.19571587443351746,
      "rewards/xmlcount_reward_func/mean": 0.7284151911735535,
      "rewards/xmlcount_reward_func/std": 0.438982218503952,
      "step": 224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 477.0,
      "completions/mean_length": 154.38394165039062,
      "completions/mean_terminated_length": 146.549560546875,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.07682130510861679,
      "grad_norm": 0.02163609303534031,
      "kl": 0.01602935791015625,
      "learning_rate": 9.31628240478787e-06,
      "loss": -0.0308,
      "num_tokens": 80051057.0,
      "reward": 0.43514472246170044,
      "reward_std": 0.06055814400315285,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.25388360023498535,
      "rewards/semantic_correctness_reward_func/mean": 0.41975903511047363,
      "rewards/semantic_correctness_reward_func/std": 0.186967134475708,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 724.0,
      "completions/mean_length": 184.8303680419922,
      "completions/mean_terminated_length": 165.6712188720703,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.07716273313132176,
      "grad_norm": 0.02016390860080719,
      "kl": 0.016954421997070312,
      "learning_rate": 9.30791267309698e-06,
      "loss": -0.0333,
      "num_tokens": 80438571.0,
      "reward": 0.4161798655986786,
      "reward_std": 0.058609530329704285,
      "rewards/gemini_judge_reward_func/mean": 0.1171875,
      "rewards/gemini_judge_reward_func/std": 0.2168499380350113,
      "rewards/semantic_correctness_reward_func/mean": 0.461211621761322,
      "rewards/semantic_correctness_reward_func/std": 0.2113778293132782,
      "rewards/xmlcount_reward_func/mean": 0.6926562190055847,
      "rewards/xmlcount_reward_func/std": 0.4631781280040741,
      "step": 226
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 165.24554443359375,
      "completions/mean_terminated_length": 145.63926696777344,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.07750416115402671,
      "grad_norm": 0.02275063470005989,
      "kl": 0.015069961547851562,
      "learning_rate": 9.299495830763285e-06,
      "loss": -0.0236,
      "num_tokens": 80815254.0,
      "reward": 0.42212775349617004,
      "reward_std": 0.074894979596138,
      "rewards/gemini_judge_reward_func/mean": 0.1607142835855484,
      "rewards/gemini_judge_reward_func/std": 0.2877489924430847,
      "rewards/semantic_correctness_reward_func/mean": 0.4664600193500519,
      "rewards/semantic_correctness_reward_func/std": 0.22998718917369843,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500184178352356,
      "step": 227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 545.0,
      "completions/mean_length": 171.2678680419922,
      "completions/mean_terminated_length": 155.7636260986328,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.07784558917673168,
      "grad_norm": 0.02133053168654442,
      "kl": 0.01563549041748047,
      "learning_rate": 9.291031969832026e-06,
      "loss": -0.0118,
      "num_tokens": 81154166.0,
      "reward": 0.4396763741970062,
      "reward_std": 0.0586925633251667,
      "rewards/gemini_judge_reward_func/mean": 0.0837053582072258,
      "rewards/gemini_judge_reward_func/std": 0.1878366768360138,
      "rewards/semantic_correctness_reward_func/mean": 0.42222103476524353,
      "rewards/semantic_correctness_reward_func/std": 0.2082173377275467,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 160.46429443359375,
      "completions/mean_terminated_length": 156.59193420410156,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.07818701719943665,
      "grad_norm": 0.020135052502155304,
      "kl": 0.014910221099853516,
      "learning_rate": 9.28252118286263e-06,
      "loss": -0.0066,
      "num_tokens": 81518406.0,
      "reward": 0.3965265154838562,
      "reward_std": 0.05617070570588112,
      "rewards/gemini_judge_reward_func/mean": 0.1138392835855484,
      "rewards/gemini_judge_reward_func/std": 0.22432857751846313,
      "rewards/semantic_correctness_reward_func/mean": 0.43941816687583923,
      "rewards/semantic_correctness_reward_func/std": 0.19124464690685272,
      "rewards/xmlcount_reward_func/mean": 0.6577678322792053,
      "rewards/xmlcount_reward_func/std": 0.47394242882728577,
      "step": 229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 890.0,
      "completions/mean_length": 168.90179443359375,
      "completions/mean_terminated_length": 149.3789825439453,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.07852844522214161,
      "grad_norm": 0.021697277203202248,
      "kl": 0.013443470001220703,
      "learning_rate": 9.273963562927695e-06,
      "loss": 0.007,
      "num_tokens": 81881684.0,
      "reward": 0.44313669204711914,
      "reward_std": 0.06279011070728302,
      "rewards/gemini_judge_reward_func/mean": 0.1316964328289032,
      "rewards/gemini_judge_reward_func/std": 0.2482219636440277,
      "rewards/semantic_correctness_reward_func/mean": 0.4566476047039032,
      "rewards/semantic_correctness_reward_func/std": 0.18283917009830475,
      "rewards/xmlcount_reward_func/mean": 0.7478214502334595,
      "rewards/xmlcount_reward_func/std": 0.43387457728385925,
      "step": 230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 422.0,
      "completions/mean_length": 167.49554443359375,
      "completions/mean_terminated_length": 147.9406280517578,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.07886987324484657,
      "grad_norm": 0.020851455628871918,
      "kl": 0.014322280883789062,
      "learning_rate": 9.265359203611988e-06,
      "loss": -0.0232,
      "num_tokens": 82245263.0,
      "reward": 0.4470902383327484,
      "reward_std": 0.06255059689283371,
      "rewards/gemini_judge_reward_func/mean": 0.1127232164144516,
      "rewards/gemini_judge_reward_func/std": 0.220489040017128,
      "rewards/semantic_correctness_reward_func/mean": 0.43700453639030457,
      "rewards/semantic_correctness_reward_func/std": 0.2078365534543991,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 231
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 488.0,
      "completions/mean_length": 175.83482360839844,
      "completions/mean_terminated_length": 152.49081420898438,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.07921130126755153,
      "grad_norm": 0.02140904776751995,
      "kl": 0.020357608795166016,
      "learning_rate": 9.256708199011402e-06,
      "loss": -0.0233,
      "num_tokens": 82588374.0,
      "reward": 0.38504737615585327,
      "reward_std": 0.07159604132175446,
      "rewards/gemini_judge_reward_func/mean": 0.1060267835855484,
      "rewards/gemini_judge_reward_func/std": 0.2383553832769394,
      "rewards/semantic_correctness_reward_func/mean": 0.3775940239429474,
      "rewards/semantic_correctness_reward_func/std": 0.18888156116008759,
      "rewards/xmlcount_reward_func/mean": 0.6677946448326111,
      "rewards/xmlcount_reward_func/std": 0.468395471572876,
      "step": 232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 476.0,
      "completions/mean_length": 159.8169708251953,
      "completions/mean_terminated_length": 144.1045379638672,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.0795527292902565,
      "grad_norm": 0.021409912034869194,
      "kl": 0.015997886657714844,
      "learning_rate": 9.248010643731936e-06,
      "loss": -0.0371,
      "num_tokens": 82966821.0,
      "reward": 0.40062811970710754,
      "reward_std": 0.0587112195789814,
      "rewards/gemini_judge_reward_func/mean": 0.09375,
      "rewards/gemini_judge_reward_func/std": 0.2012433260679245,
      "rewards/semantic_correctness_reward_func/mean": 0.411104679107666,
      "rewards/semantic_correctness_reward_func/std": 0.20091889798641205,
      "rewards/xmlcount_reward_func/mean": 0.7022679448127747,
      "rewards/xmlcount_reward_func/std": 0.45671790838241577,
      "step": 233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 527.0,
      "completions/mean_length": 165.28125,
      "completions/mean_terminated_length": 145.67579650878906,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.07989415731296146,
      "grad_norm": 0.02124546654522419,
      "kl": 0.014566898345947266,
      "learning_rate": 9.23926663288866e-06,
      "loss": -0.0139,
      "num_tokens": 83334656.0,
      "reward": 0.39910420775413513,
      "reward_std": 0.06146840751171112,
      "rewards/gemini_judge_reward_func/mean": 0.1238839253783226,
      "rewards/gemini_judge_reward_func/std": 0.2538909912109375,
      "rewards/semantic_correctness_reward_func/mean": 0.43394067883491516,
      "rewards/semantic_correctness_reward_func/std": 0.226360023021698,
      "rewards/xmlcount_reward_func/mean": 0.656906247138977,
      "rewards/xmlcount_reward_func/std": 0.47649866342544556,
      "step": 234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 727.0,
      "completions/mean_length": 151.41519165039062,
      "completions/mean_terminated_length": 139.5701446533203,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.08023558533566642,
      "grad_norm": 0.022864429280161858,
      "kl": 0.016307353973388672,
      "learning_rate": 9.230476262104678e-06,
      "loss": -0.0324,
      "num_tokens": 83697741.0,
      "reward": 0.40850719809532166,
      "reward_std": 0.06750909239053726,
      "rewards/gemini_judge_reward_func/mean": 0.1294642835855484,
      "rewards/gemini_judge_reward_func/std": 0.22078678011894226,
      "rewards/semantic_correctness_reward_func/mean": 0.42064300179481506,
      "rewards/semantic_correctness_reward_func/std": 0.20097774267196655,
      "rewards/xmlcount_reward_func/mean": 0.681482195854187,
      "rewards/xmlcount_reward_func/std": 0.4664749205112457,
      "step": 235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0401785714285714,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 185.51339721679688,
      "completions/mean_terminated_length": 150.41395568847656,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.08057701335837139,
      "grad_norm": 0.021365147083997726,
      "kl": 0.013797760009765625,
      "learning_rate": 9.221639627510076e-06,
      "loss": -0.0213,
      "num_tokens": 84090224.0,
      "reward": 0.3859476149082184,
      "reward_std": 0.06201120465993881,
      "rewards/gemini_judge_reward_func/mean": 0.1037946417927742,
      "rewards/gemini_judge_reward_func/std": 0.21724192798137665,
      "rewards/semantic_correctness_reward_func/mean": 0.4172736704349518,
      "rewards/semantic_correctness_reward_func/std": 0.20381608605384827,
      "rewards/xmlcount_reward_func/mean": 0.6524375677108765,
      "rewards/xmlcount_reward_func/std": 0.47794878482818604,
      "step": 236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 459.0,
      "completions/mean_length": 153.1428680419922,
      "completions/mean_terminated_length": 145.29730224609375,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.08091844138107636,
      "grad_norm": 0.021086620166897774,
      "kl": 0.014497756958007812,
      "learning_rate": 9.212756825740874e-06,
      "loss": -0.0396,
      "num_tokens": 84447016.0,
      "reward": 0.434138685464859,
      "reward_std": 0.07237514853477478,
      "rewards/gemini_judge_reward_func/mean": 0.1171875,
      "rewards/gemini_judge_reward_func/std": 0.24015435576438904,
      "rewards/semantic_correctness_reward_func/mean": 0.3990683853626251,
      "rewards/semantic_correctness_reward_func/std": 0.18283356726169586,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 635.0,
      "completions/mean_length": 163.5625,
      "completions/mean_terminated_length": 151.88235473632812,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.08125986940378131,
      "grad_norm": 0.02173583023250103,
      "kl": 0.01507568359375,
      "learning_rate": 9.203827953937969e-06,
      "loss": 0.0123,
      "num_tokens": 84820046.0,
      "reward": 0.40177983045578003,
      "reward_std": 0.06776406615972519,
      "rewards/gemini_judge_reward_func/mean": 0.1071428582072258,
      "rewards/gemini_judge_reward_func/std": 0.2162519097328186,
      "rewards/semantic_correctness_reward_func/mean": 0.40036338567733765,
      "rewards/semantic_correctness_reward_func/std": 0.22206875681877136,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 889.0,
      "completions/mean_length": 163.17857360839844,
      "completions/mean_terminated_length": 147.5272674560547,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.08160129742648628,
      "grad_norm": 0.02061956562101841,
      "kl": 0.019533157348632812,
      "learning_rate": 9.194853109746073e-06,
      "loss": -0.0204,
      "num_tokens": 85190930.0,
      "reward": 0.354619562625885,
      "reward_std": 0.0518239289522171,
      "rewards/gemini_judge_reward_func/mean": 0.0736607164144516,
      "rewards/gemini_judge_reward_func/std": 0.16450220346450806,
      "rewards/semantic_correctness_reward_func/mean": 0.3745262324810028,
      "rewards/semantic_correctness_reward_func/std": 0.1803187131881714,
      "rewards/xmlcount_reward_func/mean": 0.6256250739097595,
      "rewards/xmlcount_reward_func/std": 0.48569241166114807,
      "step": 239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 730.0,
      "completions/mean_length": 163.87054443359375,
      "completions/mean_terminated_length": 156.1216278076172,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.08194272544919125,
      "grad_norm": 0.02133871614933014,
      "kl": 0.0135650634765625,
      "learning_rate": 9.185832391312644e-06,
      "loss": -0.0231,
      "num_tokens": 85531313.0,
      "reward": 0.4272550642490387,
      "reward_std": 0.06283900886774063,
      "rewards/gemini_judge_reward_func/mean": 0.0870535746216774,
      "rewards/gemini_judge_reward_func/std": 0.20965971052646637,
      "rewards/semantic_correctness_reward_func/mean": 0.38916799426078796,
      "rewards/semantic_correctness_reward_func/std": 0.1963030993938446,
      "rewards/xmlcount_reward_func/mean": 0.7864999771118164,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 516.0,
      "completions/mean_length": 163.3616180419922,
      "completions/mean_terminated_length": 155.6081085205078,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.0822841534718962,
      "grad_norm": 0.01960454136133194,
      "kl": 0.013911247253417969,
      "learning_rate": 9.176765897286812e-06,
      "loss": -0.0249,
      "num_tokens": 85890730.0,
      "reward": 0.4563038647174835,
      "reward_std": 0.06880877912044525,
      "rewards/gemini_judge_reward_func/mean": 0.1551339328289032,
      "rewards/gemini_judge_reward_func/std": 0.2734237015247345,
      "rewards/semantic_correctness_reward_func/mean": 0.4544209837913513,
      "rewards/semantic_correctness_reward_func/std": 0.2353294938802719,
      "rewards/xmlcount_reward_func/mean": 0.7584152221679688,
      "rewards/xmlcount_reward_func/std": 0.42483606934547424,
      "step": 241
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 288.0,
      "completions/mean_length": 178.0982208251953,
      "completions/mean_terminated_length": 146.76852416992188,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.08262558149460117,
      "grad_norm": 0.024442723020911217,
      "kl": 0.017345428466796875,
      "learning_rate": 9.167653726818305e-06,
      "loss": -0.009,
      "num_tokens": 86252752.0,
      "reward": 0.43084582686424255,
      "reward_std": 0.07760713994503021,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.24142225086688995,
      "rewards/semantic_correctness_reward_func/mean": 0.4139074683189392,
      "rewards/semantic_correctness_reward_func/std": 0.230632945895195,
      "rewards/xmlcount_reward_func/mean": 0.7418125867843628,
      "rewards/xmlcount_reward_func/std": 0.4394664168357849,
      "step": 242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 306.0,
      "completions/mean_length": 147.71429443359375,
      "completions/mean_terminated_length": 139.81982421875,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.08296700951730614,
      "grad_norm": 0.02121170423924923,
      "kl": 0.014449596405029297,
      "learning_rate": 9.15849597955636e-06,
      "loss": -0.0202,
      "num_tokens": 86610004.0,
      "reward": 0.4123784005641937,
      "reward_std": 0.06092296540737152,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.2668020725250244,
      "rewards/semantic_correctness_reward_func/mean": 0.413177490234375,
      "rewards/semantic_correctness_reward_func/std": 0.19070591032505035,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 792.0,
      "completions/mean_length": 158.5178680419922,
      "completions/mean_terminated_length": 150.72071838378906,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.08330843754001109,
      "grad_norm": 0.02672337368130684,
      "kl": 0.01831817626953125,
      "learning_rate": 9.149292755648631e-06,
      "loss": -0.0392,
      "num_tokens": 86964124.0,
      "reward": 0.4503079354763031,
      "reward_std": 0.08422276377677917,
      "rewards/gemini_judge_reward_func/mean": 0.1618303507566452,
      "rewards/gemini_judge_reward_func/std": 0.26949670910835266,
      "rewards/semantic_correctness_reward_func/mean": 0.4621289074420929,
      "rewards/semantic_correctness_reward_func/std": 0.21201936900615692,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 427.0,
      "completions/mean_length": 172.85269165039062,
      "completions/mean_terminated_length": 145.3963165283203,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.08364986556271606,
      "grad_norm": 0.02006138674914837,
      "kl": 0.01513051986694336,
      "learning_rate": 9.140044155740102e-06,
      "loss": 0.0065,
      "num_tokens": 87345575.0,
      "reward": 0.36606886982917786,
      "reward_std": 0.058657389134168625,
      "rewards/gemini_judge_reward_func/mean": 0.09375,
      "rewards/gemini_judge_reward_func/std": 0.20809029042720795,
      "rewards/semantic_correctness_reward_func/mean": 0.42964765429496765,
      "rewards/semantic_correctness_reward_func/std": 0.19854861497879028,
      "rewards/xmlcount_reward_func/mean": 0.6065982580184937,
      "rewards/xmlcount_reward_func/std": 0.4893246293067932,
      "step": 245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 732.0,
      "completions/mean_length": 151.75894165039062,
      "completions/mean_terminated_length": 143.90090942382812,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.08399129358542103,
      "grad_norm": 0.022956129163503647,
      "kl": 0.01905345916748047,
      "learning_rate": 9.130750280971978e-06,
      "loss": -0.0355,
      "num_tokens": 87729765.0,
      "reward": 0.392170250415802,
      "reward_std": 0.06233404949307442,
      "rewards/gemini_judge_reward_func/mean": 0.1138392835855484,
      "rewards/gemini_judge_reward_func/std": 0.24464760720729828,
      "rewards/semantic_correctness_reward_func/mean": 0.3992529511451721,
      "rewards/semantic_correctness_reward_func/std": 0.21186232566833496,
      "rewards/xmlcount_reward_func/mean": 0.666959822177887,
      "rewards/xmlcount_reward_func/std": 0.47216716408729553,
      "step": 246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 876.0,
      "completions/mean_length": 165.2232208251953,
      "completions/mean_terminated_length": 161.37220764160156,
      "completions/min_length": 44.0,
      "completions/min_terminated_length": 44.0,
      "epoch": 0.084332721608126,
      "grad_norm": 0.01989656873047352,
      "kl": 0.01374053955078125,
      "learning_rate": 9.121411232980589e-06,
      "loss": -0.0314,
      "num_tokens": 88087375.0,
      "reward": 0.4254680275917053,
      "reward_std": 0.0688747987151146,
      "rewards/gemini_judge_reward_func/mean": 0.1328125,
      "rewards/gemini_judge_reward_func/std": 0.26031482219696045,
      "rewards/semantic_correctness_reward_func/mean": 0.41387563943862915,
      "rewards/semantic_correctness_reward_func/std": 0.21328496932983398,
      "rewards/xmlcount_reward_func/mean": 0.723919689655304,
      "rewards/xmlcount_reward_func/std": 0.4488491117954254,
      "step": 247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 620.0,
      "completions/mean_length": 165.77679443359375,
      "completions/mean_terminated_length": 150.1727294921875,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.08467414963083095,
      "grad_norm": 0.021479859948158264,
      "kl": 0.015668630599975586,
      "learning_rate": 9.112027113896262e-06,
      "loss": -0.0151,
      "num_tokens": 88441945.0,
      "reward": 0.4406365156173706,
      "reward_std": 0.0737806111574173,
      "rewards/gemini_judge_reward_func/mean": 0.1305803507566452,
      "rewards/gemini_judge_reward_func/std": 0.24369193613529205,
      "rewards/semantic_correctness_reward_func/mean": 0.4360485374927521,
      "rewards/semantic_correctness_reward_func/std": 0.21203678846359253,
      "rewards/xmlcount_reward_func/mean": 0.7529866099357605,
      "rewards/xmlcount_reward_func/std": 0.4318158030509949,
      "step": 248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 528.0,
      "completions/mean_length": 170.93304443359375,
      "completions/mean_terminated_length": 151.4566192626953,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.08501557765353591,
      "grad_norm": 0.020318983122706413,
      "kl": 0.015210866928100586,
      "learning_rate": 9.102598026342223e-06,
      "loss": -0.0411,
      "num_tokens": 88798298.0,
      "reward": 0.4054628312587738,
      "reward_std": 0.06705626100301743,
      "rewards/gemini_judge_reward_func/mean": 0.1104910746216774,
      "rewards/gemini_judge_reward_func/std": 0.24100728332996368,
      "rewards/semantic_correctness_reward_func/mean": 0.37633195519447327,
      "rewards/semantic_correctness_reward_func/std": 0.21562041342258453,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 249
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 723.0,
      "completions/mean_length": 162.27679443359375,
      "completions/mean_terminated_length": 150.57919311523438,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.08535700567624088,
      "grad_norm": 0.02207431197166443,
      "kl": 0.01397848129272461,
      "learning_rate": 9.093124073433464e-06,
      "loss": -0.0073,
      "num_tokens": 89144892.0,
      "reward": 0.42173489928245544,
      "reward_std": 0.053448598831892014,
      "rewards/gemini_judge_reward_func/mean": 0.109375,
      "rewards/gemini_judge_reward_func/std": 0.21771657466888428,
      "rewards/semantic_correctness_reward_func/mean": 0.42417430877685547,
      "rewards/semantic_correctness_reward_func/std": 0.19368620216846466,
      "rewards/xmlcount_reward_func/mean": 0.7328750491142273,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 459.0,
      "completions/mean_length": 158.2366180419922,
      "completions/mean_terminated_length": 154.35426330566406,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.08569843369894584,
      "grad_norm": 0.021455751731991768,
      "kl": 0.015799522399902344,
      "learning_rate": 9.083605358775612e-06,
      "loss": -0.0366,
      "num_tokens": 89481561.0,
      "reward": 0.44944095611572266,
      "reward_std": 0.06190529465675354,
      "rewards/gemini_judge_reward_func/mean": 0.1104910746216774,
      "rewards/gemini_judge_reward_func/std": 0.22538302838802338,
      "rewards/semantic_correctness_reward_func/mean": 0.4174725115299225,
      "rewards/semantic_correctness_reward_func/std": 0.19083231687545776,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 175.04464721679688,
      "completions/mean_terminated_length": 151.67889404296875,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.0860398617216508,
      "grad_norm": 0.021432137116789818,
      "kl": 0.015732288360595703,
      "learning_rate": 9.074041986463808e-06,
      "loss": -0.0306,
      "num_tokens": 89851651.0,
      "reward": 0.40240949392318726,
      "reward_std": 0.06158284842967987,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.2527475655078888,
      "rewards/semantic_correctness_reward_func/mean": 0.3960294723510742,
      "rewards/semantic_correctness_reward_func/std": 0.21675321459770203,
      "rewards/xmlcount_reward_func/mean": 0.6874732971191406,
      "rewards/xmlcount_reward_func/std": 0.46465516090393066,
      "step": 252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 371.0,
      "completions/mean_length": 174.13839721679688,
      "completions/mean_terminated_length": 150.74769592285156,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.08638128974435577,
      "grad_norm": 0.019777290523052216,
      "kl": 0.015145301818847656,
      "learning_rate": 9.064434061081562e-06,
      "loss": -0.0371,
      "num_tokens": 90212102.0,
      "reward": 0.40325623750686646,
      "reward_std": 0.04822305217385292,
      "rewards/gemini_judge_reward_func/mean": 0.1049107164144516,
      "rewards/gemini_judge_reward_func/std": 0.24060459434986115,
      "rewards/semantic_correctness_reward_func/mean": 0.4479595124721527,
      "rewards/semantic_correctness_reward_func/std": 0.20428718626499176,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 253
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 363.0,
      "completions/mean_length": 165.60714721679688,
      "completions/mean_terminated_length": 157.87387084960938,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.08672271776706073,
      "grad_norm": 0.021202484145760536,
      "kl": 0.012971878051757812,
      "learning_rate": 9.0547816876996e-06,
      "loss": -0.0236,
      "num_tokens": 90555198.0,
      "reward": 0.4358048439025879,
      "reward_std": 0.06684070080518723,
      "rewards/gemini_judge_reward_func/mean": 0.1149553582072258,
      "rewards/gemini_judge_reward_func/std": 0.22562715411186218,
      "rewards/semantic_correctness_reward_func/mean": 0.4118633270263672,
      "rewards/semantic_correctness_reward_func/std": 0.22267456352710724,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 917.0,
      "completions/mean_length": 171.31251525878906,
      "completions/mean_terminated_length": 155.80908203125,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "epoch": 0.08706414578976569,
      "grad_norm": 0.02179318107664585,
      "kl": 0.012566328048706055,
      "learning_rate": 9.045084971874738e-06,
      "loss": -0.0163,
      "num_tokens": 90887740.0,
      "reward": 0.4795321226119995,
      "reward_std": 0.07366758584976196,
      "rewards/gemini_judge_reward_func/mean": 0.1484375,
      "rewards/gemini_judge_reward_func/std": 0.25720080733299255,
      "rewards/semantic_correctness_reward_func/mean": 0.4562854766845703,
      "rewards/semantic_correctness_reward_func/std": 0.21975703537464142,
      "rewards/xmlcount_reward_func/mean": 0.8222500085830688,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 168.32589721679688,
      "completions/mean_terminated_length": 160.61712646484375,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.08740557381247066,
      "grad_norm": 0.020558955147862434,
      "kl": 0.012400150299072266,
      "learning_rate": 9.035344019648701e-06,
      "loss": -0.0201,
      "num_tokens": 91228273.0,
      "reward": 0.4610043466091156,
      "reward_std": 0.06945549696683884,
      "rewards/gemini_judge_reward_func/mean": 0.1584821492433548,
      "rewards/gemini_judge_reward_func/std": 0.2750774919986725,
      "rewards/semantic_correctness_reward_func/mean": 0.4508070945739746,
      "rewards/semantic_correctness_reward_func/std": 0.2163994014263153,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 444.0,
      "completions/mean_length": 172.97769165039062,
      "completions/mean_terminated_length": 153.54794311523438,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.08774700183517563,
      "grad_norm": 0.021823478862643242,
      "kl": 0.012814998626708984,
      "learning_rate": 9.025558937546987e-06,
      "loss": -0.0005,
      "num_tokens": 91594656.0,
      "reward": 0.47361400723457336,
      "reward_std": 0.0772940143942833,
      "rewards/gemini_judge_reward_func/mean": 0.1395089328289032,
      "rewards/gemini_judge_reward_func/std": 0.24901461601257324,
      "rewards/semantic_correctness_reward_func/mean": 0.444551944732666,
      "rewards/semantic_correctness_reward_func/std": 0.21275341510772705,
      "rewards/xmlcount_reward_func/mean": 0.8222500681877136,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 586.0,
      "completions/mean_length": 172.4107208251953,
      "completions/mean_terminated_length": 148.97247314453125,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.08808842985788058,
      "grad_norm": 0.020453812554478645,
      "kl": 0.013701677322387695,
      "learning_rate": 9.015729832577681e-06,
      "loss": 0.0261,
      "num_tokens": 91975412.0,
      "reward": 0.4378080368041992,
      "reward_std": 0.08023884147405624,
      "rewards/gemini_judge_reward_func/mean": 0.1551339328289032,
      "rewards/gemini_judge_reward_func/std": 0.28348881006240845,
      "rewards/semantic_correctness_reward_func/mean": 0.4487721025943756,
      "rewards/semantic_correctness_reward_func/std": 0.23882971704006195,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 258
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 671.0,
      "completions/mean_length": 167.9866180419922,
      "completions/mean_terminated_length": 156.36651611328125,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.08842985788058555,
      "grad_norm": 0.020137522369623184,
      "kl": 0.013067245483398438,
      "learning_rate": 9.005856812230304e-06,
      "loss": -0.0241,
      "num_tokens": 92304397.0,
      "reward": 0.4488251805305481,
      "reward_std": 0.04712558910250664,
      "rewards/gemini_judge_reward_func/mean": 0.1037946417927742,
      "rewards/gemini_judge_reward_func/std": 0.21201863884925842,
      "rewards/semantic_correctness_reward_func/mean": 0.42778635025024414,
      "rewards/semantic_correctness_reward_func/std": 0.20638985931873322,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.39858436584472656,
      "step": 259
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 780.0,
      "completions/mean_length": 180.9419708251953,
      "completions/mean_terminated_length": 161.69406127929688,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "epoch": 0.08877128590329052,
      "grad_norm": 0.020242715254426003,
      "kl": 0.012917518615722656,
      "learning_rate": 8.995939984474624e-06,
      "loss": 0.0083,
      "num_tokens": 92643564.0,
      "reward": 0.4373071789741516,
      "reward_std": 0.06374209374189377,
      "rewards/gemini_judge_reward_func/mean": 0.1104910746216774,
      "rewards/gemini_judge_reward_func/std": 0.22035281360149384,
      "rewards/semantic_correctness_reward_func/mean": 0.42386600375175476,
      "rewards/semantic_correctness_reward_func/std": 0.19683243334293365,
      "rewards/xmlcount_reward_func/mean": 0.7708438038825989,
      "rewards/xmlcount_reward_func/std": 0.4208168685436249,
      "step": 260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 379.0,
      "completions/mean_length": 163.7544708251953,
      "completions/mean_terminated_length": 148.11363220214844,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.08911271392599547,
      "grad_norm": 0.02077900432050228,
      "kl": 0.011655330657958984,
      "learning_rate": 8.98597945775948e-06,
      "loss": 0.0103,
      "num_tokens": 92977885.0,
      "reward": 0.4542004466056824,
      "reward_std": 0.0714111477136612,
      "rewards/gemini_judge_reward_func/mean": 0.1316964328289032,
      "rewards/gemini_judge_reward_func/std": 0.2330818623304367,
      "rewards/semantic_correctness_reward_func/mean": 0.43685027956962585,
      "rewards/semantic_correctness_reward_func/std": 0.2025267481803894,
      "rewards/xmlcount_reward_func/mean": 0.7853795289993286,
      "rewards/xmlcount_reward_func/std": 0.41141119599342346,
      "step": 261
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 318.0,
      "completions/mean_length": 151.34375,
      "completions/mean_terminated_length": 147.4304962158203,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.08945414194870044,
      "grad_norm": 0.022191418334841728,
      "kl": 0.013633251190185547,
      "learning_rate": 8.975975341011595e-06,
      "loss": -0.0129,
      "num_tokens": 93303462.0,
      "reward": 0.4570338726043701,
      "reward_std": 0.07553589344024658,
      "rewards/gemini_judge_reward_func/mean": 0.1194196417927742,
      "rewards/gemini_judge_reward_func/std": 0.22453764081001282,
      "rewards/semantic_correctness_reward_func/mean": 0.4375797212123871,
      "rewards/semantic_correctness_reward_func/std": 0.1922610104084015,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 161.97769165039062,
      "completions/mean_terminated_length": 154.2117156982422,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.0897955699714054,
      "grad_norm": 0.01960793137550354,
      "kl": 0.01739645004272461,
      "learning_rate": 8.96592774363439e-06,
      "loss": -0.0111,
      "num_tokens": 93676765.0,
      "reward": 0.3955410122871399,
      "reward_std": 0.06413434445858002,
      "rewards/gemini_judge_reward_func/mean": 0.1004464253783226,
      "rewards/gemini_judge_reward_func/std": 0.21559134125709534,
      "rewards/semantic_correctness_reward_func/mean": 0.41831207275390625,
      "rewards/semantic_correctness_reward_func/std": 0.17913205921649933,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 911.0,
      "completions/mean_length": 159.05357360839844,
      "completions/mean_terminated_length": 155.17489624023438,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "epoch": 0.09013699799411036,
      "grad_norm": 0.020866382867097855,
      "kl": 0.012538671493530273,
      "learning_rate": 8.955836775506776e-06,
      "loss": -0.0182,
      "num_tokens": 94019389.0,
      "reward": 0.4702732563018799,
      "reward_std": 0.07895836979150772,
      "rewards/gemini_judge_reward_func/mean": 0.1473214328289032,
      "rewards/gemini_judge_reward_func/std": 0.2731512784957886,
      "rewards/semantic_correctness_reward_func/mean": 0.4479733109474182,
      "rewards/semantic_correctness_reward_func/std": 0.2174108326435089,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 736.0,
      "completions/mean_length": 185.57589721679688,
      "completions/mean_terminated_length": 162.5,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.09047842601681533,
      "grad_norm": 0.019125469028949738,
      "kl": 0.013179779052734375,
      "learning_rate": 8.94570254698197e-06,
      "loss": -0.0534,
      "num_tokens": 94374994.0,
      "reward": 0.41334468126296997,
      "reward_std": 0.0682515949010849,
      "rewards/gemini_judge_reward_func/mean": 0.0904017835855484,
      "rewards/gemini_judge_reward_func/std": 0.23300401866436005,
      "rewards/semantic_correctness_reward_func/mean": 0.38441964983940125,
      "rewards/semantic_correctness_reward_func/std": 0.19101744890213013,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 427.0,
      "completions/mean_length": 167.33482360839844,
      "completions/mean_terminated_length": 155.7058868408203,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.0908198540395203,
      "grad_norm": 0.020079247653484344,
      "kl": 0.015445232391357422,
      "learning_rate": 8.935525168886263e-06,
      "loss": -0.0174,
      "num_tokens": 94729441.0,
      "reward": 0.4319685995578766,
      "reward_std": 0.05235441401600838,
      "rewards/gemini_judge_reward_func/mean": 0.109375,
      "rewards/gemini_judge_reward_func/std": 0.2290094941854477,
      "rewards/semantic_correctness_reward_func/mean": 0.40384286642074585,
      "rewards/semantic_correctness_reward_func/std": 0.20472340285778046,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 924.0,
      "completions/mean_length": 161.7366180419922,
      "completions/mean_terminated_length": 153.96847534179688,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.09116128206222526,
      "grad_norm": 0.02106994017958641,
      "kl": 0.015232563018798828,
      "learning_rate": 8.92530475251784e-06,
      "loss": -0.0094,
      "num_tokens": 95086954.0,
      "reward": 0.4232273995876312,
      "reward_std": 0.07202961295843124,
      "rewards/gemini_judge_reward_func/mean": 0.1540178507566452,
      "rewards/gemini_judge_reward_func/std": 0.2704537510871887,
      "rewards/semantic_correctness_reward_func/mean": 0.44960108399391174,
      "rewards/semantic_correctness_reward_func/std": 0.2044912576675415,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 330.0,
      "completions/mean_length": 164.90625,
      "completions/mean_terminated_length": 149.28636169433594,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.09150271008493022,
      "grad_norm": 0.02133958414196968,
      "kl": 0.014173030853271484,
      "learning_rate": 8.91504140964553e-06,
      "loss": -0.0226,
      "num_tokens": 95457245.0,
      "reward": 0.446917325258255,
      "reward_std": 0.06879051774740219,
      "rewards/gemini_judge_reward_func/mean": 0.1160714253783226,
      "rewards/gemini_judge_reward_func/std": 0.23180073499679565,
      "rewards/semantic_correctness_reward_func/mean": 0.4235150218009949,
      "rewards/semantic_correctness_reward_func/std": 0.21951408684253693,
      "rewards/xmlcount_reward_func/mean": 0.7894643545150757,
      "rewards/xmlcount_reward_func/std": 0.4070565700531006,
      "step": 268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 388.0,
      "completions/mean_length": 163.02679443359375,
      "completions/mean_terminated_length": 151.33937072753906,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.09184413810763518,
      "grad_norm": 0.02040465921163559,
      "kl": 0.014896392822265625,
      "learning_rate": 8.90473525250761e-06,
      "loss": -0.005,
      "num_tokens": 95823515.0,
      "reward": 0.44515278935432434,
      "reward_std": 0.0725380927324295,
      "rewards/gemini_judge_reward_func/mean": 0.1417410671710968,
      "rewards/gemini_judge_reward_func/std": 0.26416993141174316,
      "rewards/semantic_correctness_reward_func/mean": 0.4765315651893616,
      "rewards/semantic_correctness_reward_func/std": 0.21309268474578857,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427841901779175,
      "step": 269
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 439.0,
      "completions/mean_length": 174.46429443359375,
      "completions/mean_terminated_length": 151.0825653076172,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.09218556613034015,
      "grad_norm": 0.020097464323043823,
      "kl": 0.018402576446533203,
      "learning_rate": 8.894386393810563e-06,
      "loss": -0.011,
      "num_tokens": 96182759.0,
      "reward": 0.3942418396472931,
      "reward_std": 0.0597333200275898,
      "rewards/gemini_judge_reward_func/mean": 0.1395089328289032,
      "rewards/gemini_judge_reward_func/std": 0.2797568440437317,
      "rewards/semantic_correctness_reward_func/mean": 0.4409410059452057,
      "rewards/semantic_correctness_reward_func/std": 0.21194607019424438,
      "rewards/xmlcount_reward_func/mean": 0.6256250739097595,
      "rewards/xmlcount_reward_func/std": 0.48569241166114807,
      "step": 270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 715.0,
      "completions/mean_length": 161.52232360839844,
      "completions/mean_terminated_length": 153.75225830078125,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.0925269941530451,
      "grad_norm": 0.021590745076537132,
      "kl": 0.016841650009155273,
      "learning_rate": 8.883994946727848e-06,
      "loss": -0.0153,
      "num_tokens": 96539908.0,
      "reward": 0.4480898380279541,
      "reward_std": 0.060927197337150574,
      "rewards/gemini_judge_reward_func/mean": 0.1216517835855484,
      "rewards/gemini_judge_reward_func/std": 0.2379140406847,
      "rewards/semantic_correctness_reward_func/mean": 0.4241454303264618,
      "rewards/semantic_correctness_reward_func/std": 0.20089684426784515,
      "rewards/xmlcount_reward_func/mean": 0.786500096321106,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 271
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 268.0,
      "completions/mean_length": 153.83929443359375,
      "completions/mean_terminated_length": 146.0,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.09286842217575007,
      "grad_norm": 0.021431416273117065,
      "kl": 0.015526533126831055,
      "learning_rate": 8.873561024898668e-06,
      "loss": -0.0014,
      "num_tokens": 96885900.0,
      "reward": 0.4165668785572052,
      "reward_std": 0.05786097049713135,
      "rewards/gemini_judge_reward_func/mean": 0.1071428582072258,
      "rewards/gemini_judge_reward_func/std": 0.2136441320180893,
      "rewards/semantic_correctness_reward_func/mean": 0.40279844403266907,
      "rewards/semantic_correctness_reward_func/std": 0.18588510155677795,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 725.0,
      "completions/mean_length": 190.33482360839844,
      "completions/mean_terminated_length": 159.4583282470703,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.09320985019845504,
      "grad_norm": 0.01978684961795807,
      "kl": 0.016774654388427734,
      "learning_rate": 8.863084742426719e-06,
      "loss": -0.0157,
      "num_tokens": 97254931.0,
      "reward": 0.4032873511314392,
      "reward_std": 0.05024518445134163,
      "rewards/gemini_judge_reward_func/mean": 0.1261160671710968,
      "rewards/gemini_judge_reward_func/std": 0.2593517303466797,
      "rewards/semantic_correctness_reward_func/mean": 0.4146420657634735,
      "rewards/semantic_correctness_reward_func/std": 0.1971837282180786,
      "rewards/xmlcount_reward_func/mean": 0.6747812628746033,
      "rewards/xmlcount_reward_func/std": 0.4702270030975342,
      "step": 273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 807.0,
      "completions/mean_length": 184.3928680419922,
      "completions/mean_terminated_length": 161.28439331054688,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.09355127822116001,
      "grad_norm": 0.02010081149637699,
      "kl": 0.017665863037109375,
      "learning_rate": 8.852566213878947e-06,
      "loss": -0.0007,
      "num_tokens": 97643595.0,
      "reward": 0.39674344658851624,
      "reward_std": 0.074442058801651,
      "rewards/gemini_judge_reward_func/mean": 0.1517857164144516,
      "rewards/gemini_judge_reward_func/std": 0.27581340074539185,
      "rewards/semantic_correctness_reward_func/mean": 0.4288957417011261,
      "rewards/semantic_correctness_reward_func/std": 0.21292337775230408,
      "rewards/xmlcount_reward_func/mean": 0.6256250739097595,
      "rewards/xmlcount_reward_func/std": 0.48569241166114807,
      "step": 274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 945.0,
      "completions/mean_length": 173.38394165039062,
      "completions/mean_terminated_length": 161.8371124267578,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.09389270624386496,
      "grad_norm": 0.020230090245604515,
      "kl": 0.012257099151611328,
      "learning_rate": 8.842005554284296e-06,
      "loss": 0.0054,
      "num_tokens": 97985765.0,
      "reward": 0.44697558879852295,
      "reward_std": 0.05089180916547775,
      "rewards/gemini_judge_reward_func/mean": 0.1037946417927742,
      "rewards/gemini_judge_reward_func/std": 0.19835957884788513,
      "rewards/semantic_correctness_reward_func/mean": 0.4185386300086975,
      "rewards/semantic_correctness_reward_func/std": 0.20165562629699707,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 383.0,
      "completions/max_terminated_length": 383.0,
      "completions/mean_length": 147.79464721679688,
      "completions/mean_terminated_length": 147.79464721679688,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.09423413426656993,
      "grad_norm": 0.02072894014418125,
      "kl": 0.022124528884887695,
      "learning_rate": 8.831402879132447e-06,
      "loss": -0.0036,
      "num_tokens": 98334511.0,
      "reward": 0.4387449026107788,
      "reward_std": 0.06641606986522675,
      "rewards/gemini_judge_reward_func/mean": 0.1350446492433548,
      "rewards/gemini_judge_reward_func/std": 0.25258663296699524,
      "rewards/semantic_correctness_reward_func/mean": 0.42213496565818787,
      "rewards/semantic_correctness_reward_func/std": 0.22363466024398804,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 276
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 680.0,
      "completions/mean_length": 173.51339721679688,
      "completions/mean_terminated_length": 158.04998779296875,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.0945755622892749,
      "grad_norm": 0.019913461059331894,
      "kl": 0.012322187423706055,
      "learning_rate": 8.820758304372557e-06,
      "loss": -0.0162,
      "num_tokens": 98659610.0,
      "reward": 0.4966113567352295,
      "reward_std": 0.062031567096710205,
      "rewards/gemini_judge_reward_func/mean": 0.1506696492433548,
      "rewards/gemini_judge_reward_func/std": 0.25807496905326843,
      "rewards/semantic_correctness_reward_func/mean": 0.46571722626686096,
      "rewards/semantic_correctness_reward_func/std": 0.22223028540611267,
      "rewards/xmlcount_reward_func/mean": 0.8580000996589661,
      "rewards/xmlcount_reward_func/std": 0.35106155276298523,
      "step": 277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 789.0,
      "completions/mean_length": 184.04464721679688,
      "completions/mean_terminated_length": 152.9351806640625,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "epoch": 0.09491699031197985,
      "grad_norm": 0.01981070265173912,
      "kl": 0.014884471893310547,
      "learning_rate": 8.810071946411989e-06,
      "loss": -0.0066,
      "num_tokens": 99051748.0,
      "reward": 0.3772115409374237,
      "reward_std": 0.07925941050052643,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.2614969313144684,
      "rewards/semantic_correctness_reward_func/mean": 0.43397727608680725,
      "rewards/semantic_correctness_reward_func/std": 0.20137180387973785,
      "rewards/xmlcount_reward_func/mean": 0.5988079905509949,
      "rewards/xmlcount_reward_func/std": 0.4918448030948639,
      "step": 278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 911.0,
      "completions/mean_length": 178.5357208251953,
      "completions/mean_terminated_length": 163.16363525390625,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.09525841833468482,
      "grad_norm": 0.020953577011823654,
      "kl": 0.015062332153320312,
      "learning_rate": 8.799343922115045e-06,
      "loss": -0.0112,
      "num_tokens": 99401472.0,
      "reward": 0.44807708263397217,
      "reward_std": 0.06729375571012497,
      "rewards/gemini_judge_reward_func/mean": 0.1417410671710968,
      "rewards/gemini_judge_reward_func/std": 0.23616158962249756,
      "rewards/semantic_correctness_reward_func/mean": 0.4419924318790436,
      "rewards/semantic_correctness_reward_func/std": 0.1812608540058136,
      "rewards/xmlcount_reward_func/mean": 0.7574554085731506,
      "rewards/xmlcount_reward_func/std": 0.4265342950820923,
      "step": 279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 651.0,
      "completions/mean_length": 179.24107360839844,
      "completions/mean_terminated_length": 155.99081420898438,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.09559984635738979,
      "grad_norm": 0.01885247975587845,
      "kl": 0.014855623245239258,
      "learning_rate": 8.788574348801676e-06,
      "loss": -0.0195,
      "num_tokens": 99751306.0,
      "reward": 0.4536531865596771,
      "reward_std": 0.06807014346122742,
      "rewards/gemini_judge_reward_func/mean": 0.140625,
      "rewards/gemini_judge_reward_func/std": 0.24781839549541473,
      "rewards/semantic_correctness_reward_func/mean": 0.43636396527290344,
      "rewards/semantic_correctness_reward_func/std": 0.21698756515979767,
      "rewards/xmlcount_reward_func/mean": 0.7753258943557739,
      "rewards/xmlcount_reward_func/std": 0.4178903102874756,
      "step": 280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 180.2366180419922,
      "completions/mean_terminated_length": 157.0137481689453,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "epoch": 0.09594127438009474,
      "grad_norm": 0.01885247975587845,
      "kl": 0.014781713485717773,
      "learning_rate": 8.788574348801676e-06,
      "loss": -0.0174,
      "num_tokens": 100103247.0,
      "reward": 0.43649476766586304,
      "reward_std": 0.06695149838924408,
      "rewards/gemini_judge_reward_func/mean": 0.1395089328289032,
      "rewards/gemini_judge_reward_func/std": 0.28373584151268005,
      "rewards/semantic_correctness_reward_func/mean": 0.43770572543144226,
      "rewards/semantic_correctness_reward_func/std": 0.22858086228370667,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 281
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 790.0,
      "completions/mean_length": 165.84375,
      "completions/mean_terminated_length": 158.11260986328125,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.09628270240279971,
      "grad_norm": 0.019319765269756317,
      "kl": 0.013626575469970703,
      "learning_rate": 8.777763344246209e-06,
      "loss": -0.0305,
      "num_tokens": 100475300.0,
      "reward": 0.3688974678516388,
      "reward_std": 0.06074400618672371,
      "rewards/gemini_judge_reward_func/mean": 0.1082589253783226,
      "rewards/gemini_judge_reward_func/std": 0.20158810913562775,
      "rewards/semantic_correctness_reward_func/mean": 0.4124692380428314,
      "rewards/semantic_correctness_reward_func/std": 0.19219088554382324,
      "rewards/xmlcount_reward_func/mean": 0.6077499985694885,
      "rewards/xmlcount_reward_func/std": 0.48996883630752563,
      "step": 282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 888.0,
      "completions/mean_length": 172.38394165039062,
      "completions/mean_terminated_length": 156.89999389648438,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.09662413042550468,
      "grad_norm": 0.020828846842050552,
      "kl": 0.018453598022460938,
      "learning_rate": 8.766911026676063e-06,
      "loss": -0.0088,
      "num_tokens": 100845754.0,
      "reward": 0.44821447134017944,
      "reward_std": 0.07275271415710449,
      "rewards/gemini_judge_reward_func/mean": 0.1763392835855484,
      "rewards/gemini_judge_reward_func/std": 0.2763844132423401,
      "rewards/semantic_correctness_reward_func/mean": 0.45839372277259827,
      "rewards/semantic_correctness_reward_func/std": 0.2118861824274063,
      "rewards/xmlcount_reward_func/mean": 0.7150000929832458,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 283
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 934.0,
      "completions/mean_length": 176.25894165039062,
      "completions/mean_terminated_length": 156.9040985107422,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.09696555844820964,
      "grad_norm": 0.021423369646072388,
      "kl": 0.01433420181274414,
      "learning_rate": 8.756017514770444e-06,
      "loss": -0.0316,
      "num_tokens": 101213308.0,
      "reward": 0.3927696943283081,
      "reward_std": 0.05550656095147133,
      "rewards/gemini_judge_reward_func/mean": 0.0803571417927742,
      "rewards/gemini_judge_reward_func/std": 0.1809735894203186,
      "rewards/semantic_correctness_reward_func/mean": 0.3821161687374115,
      "rewards/semantic_correctness_reward_func/std": 0.18534725904464722,
      "rewards/xmlcount_reward_func/mean": 0.7105089426040649,
      "rewards/xmlcount_reward_func/std": 0.45530179142951965,
      "step": 284
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1018.0,
      "completions/mean_length": 167.0357208251953,
      "completions/mean_terminated_length": 159.31532287597656,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.0973069864709146,
      "grad_norm": 0.01976688578724861,
      "kl": 0.013885021209716797,
      "learning_rate": 8.745082927659048e-06,
      "loss": -0.0169,
      "num_tokens": 101551900.0,
      "reward": 0.459250271320343,
      "reward_std": 0.07970133423805237,
      "rewards/gemini_judge_reward_func/mean": 0.1361607164144516,
      "rewards/gemini_judge_reward_func/std": 0.26340386271476746,
      "rewards/semantic_correctness_reward_func/mean": 0.4509297311306,
      "rewards/semantic_correctness_reward_func/std": 0.2095455825328827,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 293.0,
      "completions/mean_length": 168.63839721679688,
      "completions/mean_terminated_length": 145.09632873535156,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.09764841449361956,
      "grad_norm": 0.024606870487332344,
      "kl": 0.015841007232666016,
      "learning_rate": 8.734107384920771e-06,
      "loss": -0.018,
      "num_tokens": 101920447.0,
      "reward": 0.4331532418727875,
      "reward_std": 0.0668218731880188,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.24259328842163086,
      "rewards/semantic_correctness_reward_func/mean": 0.4455518424510956,
      "rewards/semantic_correctness_reward_func/std": 0.21146556735038757,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 320.0,
      "completions/max_terminated_length": 320.0,
      "completions/mean_length": 149.97769165039062,
      "completions/mean_terminated_length": 149.97769165039062,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.09798984251632453,
      "grad_norm": 0.02012326754629612,
      "kl": 0.014970779418945312,
      "learning_rate": 8.72309100658239e-06,
      "loss": -0.0045,
      "num_tokens": 102259354.0,
      "reward": 0.4638066589832306,
      "reward_std": 0.06380105763673782,
      "rewards/gemini_judge_reward_func/mean": 0.1350446492433548,
      "rewards/gemini_judge_reward_func/std": 0.2305423468351364,
      "rewards/semantic_correctness_reward_func/mean": 0.4402652382850647,
      "rewards/semantic_correctness_reward_func/std": 0.22478193044662476,
      "rewards/xmlcount_reward_func/mean": 0.8043392896652222,
      "rewards/xmlcount_reward_func/std": 0.3985668420791626,
      "step": 287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 838.0,
      "completions/mean_length": 181.07144165039062,
      "completions/mean_terminated_length": 161.82647705078125,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.09833127053902949,
      "grad_norm": 0.02045246586203575,
      "kl": 0.014268636703491211,
      "learning_rate": 8.71203391311725e-06,
      "loss": -0.0148,
      "num_tokens": 102623894.0,
      "reward": 0.42740973830223083,
      "reward_std": 0.06286334991455078,
      "rewards/gemini_judge_reward_func/mean": 0.1049107164144516,
      "rewards/gemini_judge_reward_func/std": 0.2199123203754425,
      "rewards/semantic_correctness_reward_func/mean": 0.4346645176410675,
      "rewards/semantic_correctness_reward_func/std": 0.19024407863616943,
      "rewards/xmlcount_reward_func/mean": 0.7462812662124634,
      "rewards/xmlcount_reward_func/std": 0.4369716942310333,
      "step": 288
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 554.0,
      "completions/mean_length": 165.91519165039062,
      "completions/mean_terminated_length": 158.1846923828125,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.09867269856173445,
      "grad_norm": 0.02105833776295185,
      "kl": 0.012676715850830078,
      "learning_rate": 8.700936225443958e-06,
      "loss": -0.0097,
      "num_tokens": 102977455.0,
      "reward": 0.4517498314380646,
      "reward_std": 0.05953366681933403,
      "rewards/gemini_judge_reward_func/mean": 0.1082589253783226,
      "rewards/gemini_judge_reward_func/std": 0.22770272195339203,
      "rewards/semantic_correctness_reward_func/mean": 0.4334811270236969,
      "rewards/semantic_correctness_reward_func/std": 0.2211431860923767,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 289
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 169.62054443359375,
      "completions/mean_terminated_length": 154.08636474609375,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.09901412658443942,
      "grad_norm": 0.021059256047010422,
      "kl": 0.017063140869140625,
      "learning_rate": 8.689798064925049e-06,
      "loss": -0.0157,
      "num_tokens": 103329290.0,
      "reward": 0.42608827352523804,
      "reward_std": 0.0673830509185791,
      "rewards/gemini_judge_reward_func/mean": 0.0892857164144516,
      "rewards/gemini_judge_reward_func/std": 0.21532420814037323,
      "rewards/semantic_correctness_reward_func/mean": 0.44812875986099243,
      "rewards/semantic_correctness_reward_func/std": 0.20307044684886932,
      "rewards/xmlcount_reward_func/mean": 0.7518705725669861,
      "rewards/xmlcount_reward_func/std": 0.4314948618412018,
      "step": 290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 406.0,
      "completions/mean_length": 157.41964721679688,
      "completions/mean_terminated_length": 149.61260986328125,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.09935555460714438,
      "grad_norm": 0.02095315419137478,
      "kl": 0.016399383544921875,
      "learning_rate": 8.67861955336566e-06,
      "loss": -0.0011,
      "num_tokens": 103694592.0,
      "reward": 0.44059500098228455,
      "reward_std": 0.07820717245340347,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.25276488065719604,
      "rewards/semantic_correctness_reward_func/mean": 0.40902838110923767,
      "rewards/semantic_correctness_reward_func/std": 0.2408311814069748,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 291
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 833.0,
      "completions/mean_length": 176.21429443359375,
      "completions/mean_terminated_length": 160.79998779296875,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.09969698262984934,
      "grad_norm": 0.019932152703404427,
      "kl": 0.012141227722167969,
      "learning_rate": 8.6674008130122e-06,
      "loss": -0.0288,
      "num_tokens": 104034788.0,
      "reward": 0.46020758152008057,
      "reward_std": 0.06737792491912842,
      "rewards/gemini_judge_reward_func/mean": 0.1629464328289032,
      "rewards/gemini_judge_reward_func/std": 0.27347174286842346,
      "rewards/semantic_correctness_reward_func/mean": 0.47364482283592224,
      "rewards/semantic_correctness_reward_func/std": 0.21396541595458984,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 367.0,
      "completions/mean_length": 156.05357360839844,
      "completions/mean_terminated_length": 148.23423767089844,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.10003841065255431,
      "grad_norm": 0.021158162504434586,
      "kl": 0.016646862030029297,
      "learning_rate": 8.65614196655102e-06,
      "loss": 0.0125,
      "num_tokens": 104408036.0,
      "reward": 0.3966463506221771,
      "reward_std": 0.0536530539393425,
      "rewards/gemini_judge_reward_func/mean": 0.0993303582072258,
      "rewards/gemini_judge_reward_func/std": 0.20203447341918945,
      "rewards/semantic_correctness_reward_func/mean": 0.43345481157302856,
      "rewards/semantic_correctness_reward_func/std": 0.19227315485477448,
      "rewards/xmlcount_reward_func/mean": 0.6755580902099609,
      "rewards/xmlcount_reward_func/std": 0.4676108658313751,
      "step": 293
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 877.0,
      "completions/max_terminated_length": 877.0,
      "completions/mean_length": 157.34375,
      "completions/mean_terminated_length": 157.34375,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.10037983867525928,
      "grad_norm": 0.021672172471880913,
      "kl": 0.013187885284423828,
      "learning_rate": 8.644843137107058e-06,
      "loss": -0.0071,
      "num_tokens": 104742949.0,
      "reward": 0.48235565423965454,
      "reward_std": 0.07993865013122559,
      "rewards/gemini_judge_reward_func/mean": 0.1506696492433548,
      "rewards/gemini_judge_reward_func/std": 0.25258663296699524,
      "rewards/semantic_correctness_reward_func/mean": 0.4301888048648834,
      "rewards/semantic_correctness_reward_func/std": 0.23608912527561188,
      "rewards/xmlcount_reward_func/mean": 0.8401250839233398,
      "rewards/xmlcount_reward_func/std": 0.3684578537940979,
      "step": 294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 670.0,
      "completions/mean_length": 160.49554443359375,
      "completions/mean_terminated_length": 148.7737579345703,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.10072126669796423,
      "grad_norm": 0.019521746784448624,
      "kl": 0.015337467193603516,
      "learning_rate": 8.633504448242504e-06,
      "loss": -0.0161,
      "num_tokens": 105137996.0,
      "reward": 0.4107567369937897,
      "reward_std": 0.07504715025424957,
      "rewards/gemini_judge_reward_func/mean": 0.1495535671710968,
      "rewards/gemini_judge_reward_func/std": 0.2677757143974304,
      "rewards/semantic_correctness_reward_func/mean": 0.4689352810382843,
      "rewards/semantic_correctness_reward_func/std": 0.22458945214748383,
      "rewards/xmlcount_reward_func/mean": 0.6428705453872681,
      "rewards/xmlcount_reward_func/std": 0.47865068912506104,
      "step": 295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 552.0,
      "completions/mean_length": 157.625,
      "completions/mean_terminated_length": 153.7399139404297,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.1010626947206692,
      "grad_norm": 0.0203217975795269,
      "kl": 0.014668941497802734,
      "learning_rate": 8.622126023955446e-06,
      "loss": 0.0034,
      "num_tokens": 105495484.0,
      "reward": 0.45263081789016724,
      "reward_std": 0.07190153002738953,
      "rewards/gemini_judge_reward_func/mean": 0.1183035746216774,
      "rewards/gemini_judge_reward_func/std": 0.2306644320487976,
      "rewards/semantic_correctness_reward_func/mean": 0.4177968204021454,
      "rewards/semantic_correctness_reward_func/std": 0.22075255215168,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 647.0,
      "completions/mean_length": 174.09376525878906,
      "completions/mean_terminated_length": 150.7018280029297,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.10140412274337417,
      "grad_norm": 0.021145980805158615,
      "kl": 0.015403032302856445,
      "learning_rate": 8.610707988678504e-06,
      "loss": -0.0054,
      "num_tokens": 105849873.0,
      "reward": 0.42827874422073364,
      "reward_std": 0.07740958034992218,
      "rewards/gemini_judge_reward_func/mean": 0.1328125,
      "rewards/gemini_judge_reward_func/std": 0.2436303198337555,
      "rewards/semantic_correctness_reward_func/mean": 0.4412953555583954,
      "rewards/semantic_correctness_reward_func/std": 0.19017373025417328,
      "rewards/xmlcount_reward_func/mean": 0.7172366380691528,
      "rewards/xmlcount_reward_func/std": 0.4509044885635376,
      "step": 297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 582.0,
      "completions/mean_length": 157.50894165039062,
      "completions/mean_terminated_length": 149.7027130126953,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.10174555076607912,
      "grad_norm": 0.020420530810952187,
      "kl": 0.01408076286315918,
      "learning_rate": 8.599250467277483e-06,
      "loss": -0.0364,
      "num_tokens": 106188755.0,
      "reward": 0.4316641092300415,
      "reward_std": 0.05426663160324097,
      "rewards/gemini_judge_reward_func/mean": 0.1227678582072258,
      "rewards/gemini_judge_reward_func/std": 0.2270781695842743,
      "rewards/semantic_correctness_reward_func/mean": 0.4112846255302429,
      "rewards/semantic_correctness_reward_func/std": 0.19838160276412964,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 298
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1006.0,
      "completions/mean_length": 172.80357360839844,
      "completions/mean_terminated_length": 153.36985778808594,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.10208697878878409,
      "grad_norm": 0.020977923646569252,
      "kl": 0.014760017395019531,
      "learning_rate": 8.587753585050004e-06,
      "loss": -0.0314,
      "num_tokens": 106526319.0,
      "reward": 0.4237655997276306,
      "reward_std": 0.07243627309799194,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.264680951833725,
      "rewards/semantic_correctness_reward_func/mean": 0.3919081389904022,
      "rewards/semantic_correctness_reward_func/std": 0.21118712425231934,
      "rewards/xmlcount_reward_func/mean": 0.735111653804779,
      "rewards/xmlcount_reward_func/std": 0.4418267011642456,
      "step": 299
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 573.0,
      "completions/mean_length": 157.35269165039062,
      "completions/mean_terminated_length": 153.46636962890625,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.10242840681148906,
      "grad_norm": 0.01886204071342945,
      "kl": 0.012836217880249023,
      "learning_rate": 8.576217467724129e-06,
      "loss": -0.0136,
      "num_tokens": 106879998.0,
      "reward": 0.4238353371620178,
      "reward_std": 0.06256872415542603,
      "rewards/gemini_judge_reward_func/mean": 0.1294642835855484,
      "rewards/gemini_judge_reward_func/std": 0.2527475655078888,
      "rewards/semantic_correctness_reward_func/mean": 0.43024787306785583,
      "rewards/semantic_correctness_reward_func/std": 0.22388145327568054,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 300
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 411.0,
      "completions/mean_length": 166.91964721679688,
      "completions/mean_terminated_length": 155.2850799560547,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.10276983483419401,
      "grad_norm": 0.021645022556185722,
      "kl": 0.013919830322265625,
      "learning_rate": 8.564642241456986e-06,
      "loss": -0.0271,
      "num_tokens": 107232332.0,
      "reward": 0.47711122035980225,
      "reward_std": 0.07099025696516037,
      "rewards/gemini_judge_reward_func/mean": 0.1584821492433548,
      "rewards/gemini_judge_reward_func/std": 0.30040243268013,
      "rewards/semantic_correctness_reward_func/mean": 0.4598415791988373,
      "rewards/semantic_correctness_reward_func/std": 0.22269363701343536,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 301
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 909.0,
      "completions/mean_length": 178.12501525878906,
      "completions/mean_terminated_length": 146.79629516601562,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.10311126285689898,
      "grad_norm": 0.02136322483420372,
      "kl": 0.013774394989013672,
      "learning_rate": 8.553028032833397e-06,
      "loss": 0.0098,
      "num_tokens": 107599752.0,
      "reward": 0.4435153007507324,
      "reward_std": 0.07765571027994156,
      "rewards/gemini_judge_reward_func/mean": 0.1741071492433548,
      "rewards/gemini_judge_reward_func/std": 0.2954044044017792,
      "rewards/semantic_correctness_reward_func/mean": 0.45273685455322266,
      "rewards/semantic_correctness_reward_func/std": 0.21585151553153992,
      "rewards/xmlcount_reward_func/mean": 0.7083125114440918,
      "rewards/xmlcount_reward_func/std": 0.45263010263442993,
      "step": 302
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 465.0,
      "completions/mean_length": 154.9375,
      "completions/mean_terminated_length": 147.1081085205078,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.10345269087960395,
      "grad_norm": 0.020963052287697792,
      "kl": 0.012014389038085938,
      "learning_rate": 8.541374968864486e-06,
      "loss": -0.0102,
      "num_tokens": 107968010.0,
      "reward": 0.42616450786590576,
      "reward_std": 0.05278439447283745,
      "rewards/gemini_judge_reward_func/mean": 0.109375,
      "rewards/gemini_judge_reward_func/std": 0.22280631959438324,
      "rewards/semantic_correctness_reward_func/mean": 0.4488045871257782,
      "rewards/semantic_correctness_reward_func/std": 0.1997697800397873,
      "rewards/xmlcount_reward_func/mean": 0.7316340208053589,
      "rewards/xmlcount_reward_func/std": 0.442789226770401,
      "step": 303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 854.0,
      "completions/mean_length": 154.2991180419922,
      "completions/mean_terminated_length": 142.49322509765625,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.10379411890230891,
      "grad_norm": 0.02215876430273056,
      "kl": 0.01742839813232422,
      "learning_rate": 8.529683176986295e-06,
      "loss": -0.0091,
      "num_tokens": 108318421.0,
      "reward": 0.4235052466392517,
      "reward_std": 0.06170068308711052,
      "rewards/gemini_judge_reward_func/mean": 0.1261160671710968,
      "rewards/gemini_judge_reward_func/std": 0.22584888339042664,
      "rewards/semantic_correctness_reward_func/mean": 0.4352940022945404,
      "rewards/semantic_correctness_reward_func/std": 0.19691424071788788,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 449.0,
      "completions/mean_length": 164.59375,
      "completions/mean_terminated_length": 148.96817016601562,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.10413554692501387,
      "grad_norm": 0.0207134447991848,
      "kl": 0.012769222259521484,
      "learning_rate": 8.517952785058385e-06,
      "loss": -0.0041,
      "num_tokens": 108649042.0,
      "reward": 0.45065411925315857,
      "reward_std": 0.0538845956325531,
      "rewards/gemini_judge_reward_func/mean": 0.1026785746216774,
      "rewards/gemini_judge_reward_func/std": 0.19394874572753906,
      "rewards/semantic_correctness_reward_func/mean": 0.4034132957458496,
      "rewards/semantic_correctness_reward_func/std": 0.21382492780685425,
      "rewards/xmlcount_reward_func/mean": 0.8222500085830688,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 727.0,
      "completions/mean_length": 166.10714721679688,
      "completions/mean_terminated_length": 146.52053833007812,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.10447697494771883,
      "grad_norm": 0.020172296091914177,
      "kl": 0.014577388763427734,
      "learning_rate": 8.506183921362443e-06,
      "loss": -0.0415,
      "num_tokens": 109025054.0,
      "reward": 0.4207233786582947,
      "reward_std": 0.07463856041431427,
      "rewards/gemini_judge_reward_func/mean": 0.1629464328289032,
      "rewards/gemini_judge_reward_func/std": 0.2765292227268219,
      "rewards/semantic_correctness_reward_func/mean": 0.44155433773994446,
      "rewards/semantic_correctness_reward_func/std": 0.21850642561912537,
      "rewards/xmlcount_reward_func/mean": 0.6680848002433777,
      "rewards/xmlcount_reward_func/std": 0.4690874218940735,
      "step": 306
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 356.0,
      "completions/mean_length": 154.96875,
      "completions/mean_terminated_length": 143.1719512939453,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.1048184029704238,
      "grad_norm": 0.022241463884711266,
      "kl": 0.01328420639038086,
      "learning_rate": 8.494376714600878e-06,
      "loss": 0.0014,
      "num_tokens": 109369683.0,
      "reward": 0.4572017788887024,
      "reward_std": 0.06673526763916016,
      "rewards/gemini_judge_reward_func/mean": 0.1350446492433548,
      "rewards/gemini_judge_reward_func/std": 0.22809800505638123,
      "rewards/semantic_correctness_reward_func/mean": 0.42502665519714355,
      "rewards/semantic_correctness_reward_func/std": 0.22096006572246552,
      "rewards/xmlcount_reward_func/mean": 0.795446515083313,
      "rewards/xmlcount_reward_func/std": 0.3996833562850952,
      "step": 307
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 787.0,
      "completions/mean_length": 176.94644165039062,
      "completions/mean_terminated_length": 149.6221160888672,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.10515983099312876,
      "grad_norm": 0.02019048109650612,
      "kl": 0.012269735336303711,
      "learning_rate": 8.482531293895412e-06,
      "loss": -0.0139,
      "num_tokens": 109722579.0,
      "reward": 0.4193563163280487,
      "reward_std": 0.05619501322507858,
      "rewards/gemini_judge_reward_func/mean": 0.1439732164144516,
      "rewards/gemini_judge_reward_func/std": 0.24185720086097717,
      "rewards/semantic_correctness_reward_func/mean": 0.4145851731300354,
      "rewards/semantic_correctness_reward_func/std": 0.21506093442440033,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 893.0,
      "completions/mean_length": 168.7366180419922,
      "completions/mean_terminated_length": 157.126708984375,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.10550125901583372,
      "grad_norm": 0.022851206362247467,
      "kl": 0.013672828674316406,
      "learning_rate": 8.470647788785665e-06,
      "loss": -0.0349,
      "num_tokens": 110095080.0,
      "reward": 0.4072951674461365,
      "reward_std": 0.06596186012029648,
      "rewards/gemini_judge_reward_func/mean": 0.125,
      "rewards/gemini_judge_reward_func/std": 0.24144557118415833,
      "rewards/semantic_correctness_reward_func/mean": 0.4279758036136627,
      "rewards/semantic_correctness_reward_func/std": 0.21440142393112183,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 309
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 147.3125,
      "completions/mean_terminated_length": 143.3811798095703,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.10584268703853869,
      "grad_norm": 0.023006869480013847,
      "kl": 0.017367839813232422,
      "learning_rate": 8.458726329227748e-06,
      "loss": -0.0159,
      "num_tokens": 110468182.0,
      "reward": 0.413924902677536,
      "reward_std": 0.0590013712644577,
      "rewards/gemini_judge_reward_func/mean": 0.1138392835855484,
      "rewards/gemini_judge_reward_func/std": 0.2491879016160965,
      "rewards/semantic_correctness_reward_func/mean": 0.4119459092617035,
      "rewards/semantic_correctness_reward_func/std": 0.22772468626499176,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 310
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 560.0,
      "completions/mean_length": 159.52232360839844,
      "completions/mean_terminated_length": 147.78733825683594,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.10618411506124366,
      "grad_norm": 0.02124634012579918,
      "kl": 0.014283180236816406,
      "learning_rate": 8.446767045592829e-06,
      "loss": -0.0032,
      "num_tokens": 110842627.0,
      "reward": 0.40707066655158997,
      "reward_std": 0.058956243097782135,
      "rewards/gemini_judge_reward_func/mean": 0.1049107164144516,
      "rewards/gemini_judge_reward_func/std": 0.20404648780822754,
      "rewards/semantic_correctness_reward_func/mean": 0.39553165435791016,
      "rewards/semantic_correctness_reward_func/std": 0.20496897399425507,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 311
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 419.0,
      "completions/mean_length": 144.42857360839844,
      "completions/mean_terminated_length": 136.50450134277344,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.10652554308394861,
      "grad_norm": 0.023235971108078957,
      "kl": 0.017826557159423828,
      "learning_rate": 8.434770068665723e-06,
      "loss": -0.02,
      "num_tokens": 111209299.0,
      "reward": 0.39658233523368835,
      "reward_std": 0.0629458948969841,
      "rewards/gemini_judge_reward_func/mean": 0.1640625,
      "rewards/gemini_judge_reward_func/std": 0.2691808044910431,
      "rewards/semantic_correctness_reward_func/mean": 0.4392865300178528,
      "rewards/semantic_correctness_reward_func/std": 0.20642463862895966,
      "rewards/xmlcount_reward_func/mean": 0.6077500581741333,
      "rewards/xmlcount_reward_func/std": 0.48996880650520325,
      "step": 312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 513.0,
      "completions/mean_length": 174.95982360839844,
      "completions/mean_terminated_length": 147.57142639160156,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.10686697110665358,
      "grad_norm": 0.025226525962352753,
      "kl": 0.013714790344238281,
      "learning_rate": 8.422735529643445e-06,
      "loss": -0.0039,
      "num_tokens": 111573746.0,
      "reward": 0.4315214157104492,
      "reward_std": 0.0645042136311531,
      "rewards/gemini_judge_reward_func/mean": 0.1026785746216774,
      "rewards/gemini_judge_reward_func/std": 0.22845152020454407,
      "rewards/semantic_correctness_reward_func/mean": 0.41499972343444824,
      "rewards/semantic_correctness_reward_func/std": 0.20047926902770996,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 845.0,
      "completions/mean_length": 155.8482208251953,
      "completions/mean_terminated_length": 144.0633544921875,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.10720839912935855,
      "grad_norm": 0.020772015675902367,
      "kl": 0.014401912689208984,
      "learning_rate": 8.410663560133784e-06,
      "loss": -0.0003,
      "num_tokens": 111937032.0,
      "reward": 0.4510946273803711,
      "reward_std": 0.06376608461141586,
      "rewards/gemini_judge_reward_func/mean": 0.1540178507566452,
      "rewards/gemini_judge_reward_func/std": 0.25662854313850403,
      "rewards/semantic_correctness_reward_func/mean": 0.4459371864795685,
      "rewards/semantic_correctness_reward_func/std": 0.2272828072309494,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 160.9732208251953,
      "completions/mean_terminated_length": 141.26939392089844,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.1075498271520635,
      "grad_norm": 0.02254970371723175,
      "kl": 0.017621517181396484,
      "learning_rate": 8.398554292153866e-06,
      "loss": 0.0044,
      "num_tokens": 112320054.0,
      "reward": 0.4214918315410614,
      "reward_std": 0.05463023856282234,
      "rewards/gemini_judge_reward_func/mean": 0.1339285671710968,
      "rewards/gemini_judge_reward_func/std": 0.2570280432701111,
      "rewards/semantic_correctness_reward_func/mean": 0.4453520178794861,
      "rewards/semantic_correctness_reward_func/std": 0.21047256886959076,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 581.0,
      "completions/mean_length": 147.58482360839844,
      "completions/mean_terminated_length": 139.68919372558594,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.10789125517476847,
      "grad_norm": 0.023806257173419,
      "kl": 0.014993429183959961,
      "learning_rate": 8.386407858128707e-06,
      "loss": 0.0018,
      "num_tokens": 112692217.0,
      "reward": 0.38690313696861267,
      "reward_std": 0.06407325714826584,
      "rewards/gemini_judge_reward_func/mean": 0.09375,
      "rewards/gemini_judge_reward_func/std": 0.19843840599060059,
      "rewards/semantic_correctness_reward_func/mean": 0.41531017422676086,
      "rewards/semantic_correctness_reward_func/std": 0.19655534625053406,
      "rewards/xmlcount_reward_func/mean": 0.6658526659011841,
      "rewards/xmlcount_reward_func/std": 0.4686855971813202,
      "step": 316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 591.0,
      "completions/mean_length": 170.46875,
      "completions/mean_terminated_length": 142.93548583984375,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.10823268319747344,
      "grad_norm": 0.02215094119310379,
      "kl": 0.016452312469482422,
      "learning_rate": 8.37422439088976e-06,
      "loss": -0.0031,
      "num_tokens": 113080238.0,
      "reward": 0.4259691536426544,
      "reward_std": 0.0712801143527031,
      "rewards/gemini_judge_reward_func/mean": 0.1640625,
      "rewards/gemini_judge_reward_func/std": 0.2541865110397339,
      "rewards/semantic_correctness_reward_func/mean": 0.4343097507953644,
      "rewards/semantic_correctness_reward_func/std": 0.22840382158756256,
      "rewards/xmlcount_reward_func/mean": 0.6837054491043091,
      "rewards/xmlcount_reward_func/std": 0.46439453959465027,
      "step": 317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 775.0,
      "completions/mean_length": 166.27679443359375,
      "completions/mean_terminated_length": 142.6697235107422,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.10857411122017839,
      "grad_norm": 0.021131092682480812,
      "kl": 0.017343997955322266,
      "learning_rate": 8.362004023673473e-06,
      "loss": -0.0116,
      "num_tokens": 113452164.0,
      "reward": 0.3965999484062195,
      "reward_std": 0.06723373383283615,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.24715863168239594,
      "rewards/semantic_correctness_reward_func/mean": 0.40355318784713745,
      "rewards/semantic_correctness_reward_func/std": 0.1819203644990921,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 856.0,
      "completions/mean_length": 170.62054443359375,
      "completions/mean_terminated_length": 151.13697814941406,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.10891553924288336,
      "grad_norm": 0.023129483684897423,
      "kl": 0.01513051986694336,
      "learning_rate": 8.349746890119826e-06,
      "loss": 0.0071,
      "num_tokens": 113825511.0,
      "reward": 0.4296068251132965,
      "reward_std": 0.054870616644620895,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.2571618854999542,
      "rewards/semantic_correctness_reward_func/mean": 0.4613375663757324,
      "rewards/semantic_correctness_reward_func/std": 0.20210479199886322,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 319
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 918.0,
      "completions/mean_length": 161.46429443359375,
      "completions/mean_terminated_length": 145.7818145751953,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.10925696726558833,
      "grad_norm": 0.021242870017886162,
      "kl": 0.015431404113769531,
      "learning_rate": 8.337453124270864e-06,
      "loss": 0.0266,
      "num_tokens": 114167051.0,
      "reward": 0.4469578266143799,
      "reward_std": 0.07928242534399033,
      "rewards/gemini_judge_reward_func/mean": 0.1506696492433548,
      "rewards/gemini_judge_reward_func/std": 0.2936350107192993,
      "rewards/semantic_correctness_reward_func/mean": 0.3984406888484955,
      "rewards/semantic_correctness_reward_func/std": 0.2311331182718277,
      "rewards/xmlcount_reward_func/mean": 0.7675045132637024,
      "rewards/xmlcount_reward_func/std": 0.42328277230262756,
      "step": 320
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 827.0,
      "completions/mean_length": 161.5357208251953,
      "completions/mean_terminated_length": 145.8545379638672,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.1095983952882933,
      "grad_norm": 0.02152419462800026,
      "kl": 0.016777515411376953,
      "learning_rate": 8.325122860569241e-06,
      "loss": -0.004,
      "num_tokens": 114533951.0,
      "reward": 0.4059690237045288,
      "reward_std": 0.05474109575152397,
      "rewards/gemini_judge_reward_func/mean": 0.1238839253783226,
      "rewards/gemini_judge_reward_func/std": 0.23317579925060272,
      "rewards/semantic_correctness_reward_func/mean": 0.4235771596431732,
      "rewards/semantic_correctness_reward_func/std": 0.21103839576244354,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 321
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 655.0,
      "completions/mean_length": 161.73214721679688,
      "completions/mean_terminated_length": 150.02716064453125,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.10993982331099825,
      "grad_norm": 0.021091651171445847,
      "kl": 0.012056350708007812,
      "learning_rate": 8.31275623385675e-06,
      "loss": 0.0014,
      "num_tokens": 114893815.0,
      "reward": 0.4708484411239624,
      "reward_std": 0.06425228714942932,
      "rewards/gemini_judge_reward_func/mean": 0.1540178507566452,
      "rewards/gemini_judge_reward_func/std": 0.26837313175201416,
      "rewards/semantic_correctness_reward_func/mean": 0.4374563992023468,
      "rewards/semantic_correctness_reward_func/std": 0.2451786994934082,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 763.0,
      "completions/mean_length": 157.95089721679688,
      "completions/mean_terminated_length": 138.17807006835938,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.11028125133370321,
      "grad_norm": 0.02196911908686161,
      "kl": 0.01731395721435547,
      "learning_rate": 8.300353379372834e-06,
      "loss": -0.0236,
      "num_tokens": 115274700.0,
      "reward": 0.3980475962162018,
      "reward_std": 0.05808022618293762,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.225807324051857,
      "rewards/semantic_correctness_reward_func/mean": 0.4579342305660248,
      "rewards/semantic_correctness_reward_func/std": 0.18658004701137543,
      "rewards/xmlcount_reward_func/mean": 0.6456161141395569,
      "rewards/xmlcount_reward_func/std": 0.4789053201675415,
      "step": 323
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 595.0,
      "completions/mean_length": 169.99554443359375,
      "completions/mean_terminated_length": 138.36573791503906,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.11062267935640818,
      "grad_norm": 0.020508458837866783,
      "kl": 0.014233112335205078,
      "learning_rate": 8.287914432753123e-06,
      "loss": 0.0114,
      "num_tokens": 115661423.0,
      "reward": 0.3872678577899933,
      "reward_std": 0.05701467767357826,
      "rewards/gemini_judge_reward_func/mean": 0.0803571417927742,
      "rewards/gemini_judge_reward_func/std": 0.2042548656463623,
      "rewards/semantic_correctness_reward_func/mean": 0.41712480783462524,
      "rewards/semantic_correctness_reward_func/std": 0.1884647160768509,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.0,
      "completions/mean_length": 163.37054443359375,
      "completions/mean_terminated_length": 143.72145080566406,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.11096410737911314,
      "grad_norm": 0.01983817107975483,
      "kl": 0.013721704483032227,
      "learning_rate": 8.275439530027948e-06,
      "loss": 0.0108,
      "num_tokens": 116015134.0,
      "reward": 0.440729558467865,
      "reward_std": 0.05826781690120697,
      "rewards/gemini_judge_reward_func/mean": 0.1830357164144516,
      "rewards/gemini_judge_reward_func/std": 0.3022378087043762,
      "rewards/semantic_correctness_reward_func/mean": 0.4790761172771454,
      "rewards/semantic_correctness_reward_func/std": 0.2402830719947815,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853896975517273,
      "step": 325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 878.0,
      "completions/mean_length": 170.1919708251953,
      "completions/mean_terminated_length": 154.66818237304688,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.1113055354018181,
      "grad_norm": 0.019503232091665268,
      "kl": 0.013273954391479492,
      "learning_rate": 8.262928807620843e-06,
      "loss": -0.0252,
      "num_tokens": 116375149.0,
      "reward": 0.44340530037879944,
      "reward_std": 0.05507688969373703,
      "rewards/gemini_judge_reward_func/mean": 0.1127232164144516,
      "rewards/gemini_judge_reward_func/std": 0.21272562444210052,
      "rewards/semantic_correctness_reward_func/mean": 0.4498923718929291,
      "rewards/semantic_correctness_reward_func/std": 0.21063929796218872,
      "rewards/xmlcount_reward_func/mean": 0.7708438038825989,
      "rewards/xmlcount_reward_func/std": 0.4208168685436249,
      "step": 326
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 307.0,
      "completions/mean_length": 165.9866180419922,
      "completions/mean_terminated_length": 142.37155151367188,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.11164696342452307,
      "grad_norm": 0.02120453119277954,
      "kl": 0.016252994537353516,
      "learning_rate": 8.250382402347066e-06,
      "loss": 0.0012,
      "num_tokens": 116728550.0,
      "reward": 0.4338933527469635,
      "reward_std": 0.06330207735300064,
      "rewards/gemini_judge_reward_func/mean": 0.0904017835855484,
      "rewards/gemini_judge_reward_func/std": 0.1992909461259842,
      "rewards/semantic_correctness_reward_func/mean": 0.41566306352615356,
      "rewards/semantic_correctness_reward_func/std": 0.1946808099746704,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 327
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 341.0,
      "completions/mean_length": 142.6428680419922,
      "completions/mean_terminated_length": 138.6905975341797,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.11198839144722803,
      "grad_norm": 0.022184768691658974,
      "kl": 0.015604972839355469,
      "learning_rate": 8.237800451412095e-06,
      "loss": -0.0052,
      "num_tokens": 117078914.0,
      "reward": 0.4221014678478241,
      "reward_std": 0.07018353044986725,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.2614849805831909,
      "rewards/semantic_correctness_reward_func/mean": 0.4238108694553375,
      "rewards/semantic_correctness_reward_func/std": 0.18395309150218964,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 327.0,
      "completions/mean_length": 160.99554443359375,
      "completions/mean_terminated_length": 133.15667724609375,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.11232981946993299,
      "grad_norm": 0.022962870076298714,
      "kl": 0.016230106353759766,
      "learning_rate": 8.225183092410128e-06,
      "loss": -0.0089,
      "num_tokens": 117432929.0,
      "reward": 0.38007089495658875,
      "reward_std": 0.057407211512327194,
      "rewards/gemini_judge_reward_func/mean": 0.1149553582072258,
      "rewards/gemini_judge_reward_func/std": 0.22438155114650726,
      "rewards/semantic_correctness_reward_func/mean": 0.41198840737342834,
      "rewards/semantic_correctness_reward_func/std": 0.2041979283094406,
      "rewards/xmlcount_reward_func/mean": 0.6292276978492737,
      "rewards/xmlcount_reward_func/std": 0.4825986325740814,
      "step": 329
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 846.0,
      "completions/max_terminated_length": 846.0,
      "completions/mean_length": 142.21429443359375,
      "completions/mean_terminated_length": 142.21429443359375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.11267124749263796,
      "grad_norm": 0.025767182931303978,
      "kl": 0.015546798706054688,
      "learning_rate": 8.212530463322584e-06,
      "loss": 0.0039,
      "num_tokens": 117773413.0,
      "reward": 0.4443262815475464,
      "reward_std": 0.0675266683101654,
      "rewards/gemini_judge_reward_func/mean": 0.1517857164144516,
      "rewards/gemini_judge_reward_func/std": 0.26012489199638367,
      "rewards/semantic_correctness_reward_func/mean": 0.45230987668037415,
      "rewards/semantic_correctness_reward_func/std": 0.2161291390657425,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 330
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 428.0,
      "completions/mean_length": 148.66964721679688,
      "completions/mean_terminated_length": 140.78378295898438,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.11301267551534293,
      "grad_norm": 0.022938484326004982,
      "kl": 0.01628732681274414,
      "learning_rate": 8.199842702516584e-06,
      "loss": -0.0092,
      "num_tokens": 118108139.0,
      "reward": 0.4258379638195038,
      "reward_std": 0.04991302639245987,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.225807324051857,
      "rewards/semantic_correctness_reward_func/mean": 0.4223683178424835,
      "rewards/semantic_correctness_reward_func/std": 0.20527078211307526,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 331
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 152.6741180419922,
      "completions/mean_terminated_length": 144.82432556152344,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.11335410353804788,
      "grad_norm": 0.02302442491054535,
      "kl": 0.017078876495361328,
      "learning_rate": 8.18711994874345e-06,
      "loss": 0.0042,
      "num_tokens": 118459982.0,
      "reward": 0.4548938274383545,
      "reward_std": 0.07729874551296234,
      "rewards/gemini_judge_reward_func/mean": 0.1462053507566452,
      "rewards/gemini_judge_reward_func/std": 0.248531773686409,
      "rewards/semantic_correctness_reward_func/mean": 0.4805581271648407,
      "rewards/semantic_correctness_reward_func/std": 0.2108439952135086,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 332
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 351.0,
      "completions/mean_length": 163.9375,
      "completions/mean_terminated_length": 148.29998779296875,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.11369553156075285,
      "grad_norm": 0.02004287578165531,
      "kl": 0.013605833053588867,
      "learning_rate": 8.174362341137177e-06,
      "loss": -0.0139,
      "num_tokens": 118833092.0,
      "reward": 0.46299633383750916,
      "reward_std": 0.06390392780303955,
      "rewards/gemini_judge_reward_func/mean": 0.1662946492433548,
      "rewards/gemini_judge_reward_func/std": 0.2740088105201721,
      "rewards/semantic_correctness_reward_func/mean": 0.4640171229839325,
      "rewards/semantic_correctness_reward_func/std": 0.22488431632518768,
      "rewards/xmlcount_reward_func/mean": 0.7591875791549683,
      "rewards/xmlcount_reward_func/std": 0.42421701550483704,
      "step": 333
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 284.0,
      "completions/mean_length": 140.8616180419922,
      "completions/mean_terminated_length": 128.87330627441406,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.11403695958345782,
      "grad_norm": 0.01933170109987259,
      "kl": 0.01584768295288086,
      "learning_rate": 8.161570019212921e-06,
      "loss": -0.0117,
      "num_tokens": 119202209.0,
      "reward": 0.3510363698005676,
      "reward_std": 0.045246776193380356,
      "rewards/gemini_judge_reward_func/mean": 0.0837053582072258,
      "rewards/gemini_judge_reward_func/std": 0.18177036941051483,
      "rewards/semantic_correctness_reward_func/mean": 0.37227097153663635,
      "rewards/semantic_correctness_reward_func/std": 0.1903991997241974,
      "rewards/xmlcount_reward_func/mean": 0.6077500581741333,
      "rewards/xmlcount_reward_func/std": 0.48996880650520325,
      "step": 334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 881.0,
      "completions/mean_length": 156.82144165039062,
      "completions/mean_terminated_length": 145.04977416992188,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.11437838760616277,
      "grad_norm": 0.020192094147205353,
      "kl": 0.017360687255859375,
      "learning_rate": 8.148743122865463e-06,
      "loss": 0.0078,
      "num_tokens": 119574049.0,
      "reward": 0.3997800946235657,
      "reward_std": 0.06350252032279968,
      "rewards/gemini_judge_reward_func/mean": 0.1529017835855484,
      "rewards/gemini_judge_reward_func/std": 0.2915138602256775,
      "rewards/semantic_correctness_reward_func/mean": 0.4418465793132782,
      "rewards/semantic_correctness_reward_func/std": 0.21829283237457275,
      "rewards/xmlcount_reward_func/mean": 0.6256250739097595,
      "rewards/xmlcount_reward_func/std": 0.48569241166114807,
      "step": 335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 641.0,
      "completions/mean_length": 149.9732208251953,
      "completions/mean_terminated_length": 142.09910583496094,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.11471981562886774,
      "grad_norm": 0.021611372008919716,
      "kl": 0.017676830291748047,
      "learning_rate": 8.135881792367686e-06,
      "loss": 0.0081,
      "num_tokens": 119918387.0,
      "reward": 0.4103807508945465,
      "reward_std": 0.05042886361479759,
      "rewards/gemini_judge_reward_func/mean": 0.1015625,
      "rewards/gemini_judge_reward_func/std": 0.20777438580989838,
      "rewards/semantic_correctness_reward_func/mean": 0.38929662108421326,
      "rewards/semantic_correctness_reward_func/std": 0.2001960128545761,
      "rewards/xmlcount_reward_func/mean": 0.729741096496582,
      "rewards/xmlcount_reward_func/std": 0.4440862536430359,
      "step": 336
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 828.0,
      "completions/mean_length": 160.75,
      "completions/mean_terminated_length": 152.9729766845703,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.1150612436515727,
      "grad_norm": 0.020709911361336708,
      "kl": 0.01323080062866211,
      "learning_rate": 8.12298616836904e-06,
      "loss": -0.0084,
      "num_tokens": 120264343.0,
      "reward": 0.44805604219436646,
      "reward_std": 0.06720651686191559,
      "rewards/gemini_judge_reward_func/mean": 0.1116071417927742,
      "rewards/gemini_judge_reward_func/std": 0.23990675806999207,
      "rewards/semantic_correctness_reward_func/mean": 0.44406577944755554,
      "rewards/semantic_correctness_reward_func/std": 0.20191948115825653,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 337
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 523.0,
      "completions/mean_length": 148.8125,
      "completions/mean_terminated_length": 144.8878936767578,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.11540267167427766,
      "grad_norm": 0.021978365257382393,
      "kl": 0.014225482940673828,
      "learning_rate": 8.110056391894005e-06,
      "loss": 0.0029,
      "num_tokens": 120599325.0,
      "reward": 0.44396767020225525,
      "reward_std": 0.06658326089382172,
      "rewards/gemini_judge_reward_func/mean": 0.0825892835855484,
      "rewards/gemini_judge_reward_func/std": 0.19055145978927612,
      "rewards/semantic_correctness_reward_func/mean": 0.41909700632095337,
      "rewards/semantic_correctness_reward_func/std": 0.20795229077339172,
      "rewards/xmlcount_reward_func/mean": 0.8177813291549683,
      "rewards/xmlcount_reward_func/std": 0.3879494369029999,
      "step": 338
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 151.71875,
      "completions/mean_terminated_length": 147.80718994140625,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.11574409969698263,
      "grad_norm": 0.022666750475764275,
      "kl": 0.017026901245117188,
      "learning_rate": 8.097092604340543e-06,
      "loss": -0.004,
      "num_tokens": 120944658.0,
      "reward": 0.45621275901794434,
      "reward_std": 0.07161962240934372,
      "rewards/gemini_judge_reward_func/mean": 0.1495535671710968,
      "rewards/gemini_judge_reward_func/std": 0.27193009853363037,
      "rewards/semantic_correctness_reward_func/mean": 0.43579572439193726,
      "rewards/semantic_correctness_reward_func/std": 0.23262245953083038,
      "rewards/xmlcount_reward_func/mean": 0.7730804085731506,
      "rewards/xmlcount_reward_func/std": 0.41802364587783813,
      "step": 339
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 153.79464721679688,
      "completions/mean_terminated_length": 149.8923797607422,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.1160855277196876,
      "grad_norm": 0.02177649922668934,
      "kl": 0.016196727752685547,
      "learning_rate": 8.084094947478556e-06,
      "loss": -0.0169,
      "num_tokens": 121279912.0,
      "reward": 0.4234395921230316,
      "reward_std": 0.05121118947863579,
      "rewards/gemini_judge_reward_func/mean": 0.0825892835855484,
      "rewards/gemini_judge_reward_func/std": 0.18151207268238068,
      "rewards/semantic_correctness_reward_func/mean": 0.37901926040649414,
      "rewards/semantic_correctness_reward_func/std": 0.20515023171901703,
      "rewards/xmlcount_reward_func/mean": 0.7864999771118164,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 340
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 798.0,
      "completions/mean_length": 146.62054443359375,
      "completions/mean_terminated_length": 142.6861114501953,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.11642695574239256,
      "grad_norm": 0.022630201652646065,
      "kl": 0.01521444320678711,
      "learning_rate": 8.071063563448341e-06,
      "loss": 0.0077,
      "num_tokens": 121619223.0,
      "reward": 0.4508695602416992,
      "reward_std": 0.07667838037014008,
      "rewards/gemini_judge_reward_func/mean": 0.1238839253783226,
      "rewards/gemini_judge_reward_func/std": 0.2615041136741638,
      "rewards/semantic_correctness_reward_func/mean": 0.4436066150665283,
      "rewards/semantic_correctness_reward_func/std": 0.20674681663513184,
      "rewards/xmlcount_reward_func/mean": 0.7814866304397583,
      "rewards/xmlcount_reward_func/std": 0.41452744603157043,
      "step": 341
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 366.0,
      "completions/mean_length": 155.80357360839844,
      "completions/mean_terminated_length": 140.01817321777344,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.11676838376509752,
      "grad_norm": 0.020520929247140884,
      "kl": 0.01617908477783203,
      "learning_rate": 8.057998594759022e-06,
      "loss": -0.0048,
      "num_tokens": 121980211.0,
      "reward": 0.4046533703804016,
      "reward_std": 0.05556685850024223,
      "rewards/gemini_judge_reward_func/mean": 0.0993303582072258,
      "rewards/gemini_judge_reward_func/std": 0.2317548543214798,
      "rewards/semantic_correctness_reward_func/mean": 0.43035584688186646,
      "rewards/semantic_correctness_reward_func/std": 0.18309862911701202,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 342
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 302.0,
      "completions/mean_length": 161.95089721679688,
      "completions/mean_terminated_length": 134.14285278320312,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.11710981178780248,
      "grad_norm": 0.02210637740790844,
      "kl": 0.017457008361816406,
      "learning_rate": 8.044900184287007e-06,
      "loss": -0.0198,
      "num_tokens": 122351324.0,
      "reward": 0.39233168959617615,
      "reward_std": 0.05195912346243858,
      "rewards/gemini_judge_reward_func/mean": 0.1395089328289032,
      "rewards/gemini_judge_reward_func/std": 0.268510639667511,
      "rewards/semantic_correctness_reward_func/mean": 0.4501582682132721,
      "rewards/semantic_correctness_reward_func/std": 0.2134057879447937,
      "rewards/xmlcount_reward_func/mean": 0.6162410378456116,
      "rewards/xmlcount_reward_func/std": 0.4834427833557129,
      "step": 343
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 710.0,
      "completions/mean_length": 156.8303680419922,
      "completions/mean_terminated_length": 149.0180206298828,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.11745123981050745,
      "grad_norm": 0.02535112388432026,
      "kl": 0.014339447021484375,
      "learning_rate": 8.031768475274412e-06,
      "loss": -0.0396,
      "num_tokens": 122719626.0,
      "reward": 0.49485448002815247,
      "reward_std": 0.07723495364189148,
      "rewards/gemini_judge_reward_func/mean": 0.1908482164144516,
      "rewards/gemini_judge_reward_func/std": 0.29018884897232056,
      "rewards/semantic_correctness_reward_func/mean": 0.4480757713317871,
      "rewards/semantic_correctness_reward_func/std": 0.20120421051979065,
      "rewards/xmlcount_reward_func/mean": 0.8222500681877136,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 344
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 317.0,
      "completions/mean_length": 144.80804443359375,
      "completions/mean_terminated_length": 136.88739013671875,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.1177926678332124,
      "grad_norm": 0.023620422929525375,
      "kl": 0.01935100555419922,
      "learning_rate": 8.018603611327505e-06,
      "loss": 0.0273,
      "num_tokens": 123068043.0,
      "reward": 0.436502605676651,
      "reward_std": 0.06161291524767876,
      "rewards/gemini_judge_reward_func/mean": 0.1417410671710968,
      "rewards/gemini_judge_reward_func/std": 0.2755933105945587,
      "rewards/semantic_correctness_reward_func/mean": 0.4363076984882355,
      "rewards/semantic_correctness_reward_func/std": 0.21314512193202972,
      "rewards/xmlcount_reward_func/mean": 0.7313615679740906,
      "rewards/xmlcount_reward_func/std": 0.4439382255077362,
      "step": 345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 764.0,
      "completions/mean_length": 146.9107208251953,
      "completions/mean_terminated_length": 139.00901794433594,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.11813409585591737,
      "grad_norm": 0.023379402235150337,
      "kl": 0.020755767822265625,
      "learning_rate": 8.005405736415127e-06,
      "loss": 0.0179,
      "num_tokens": 123455955.0,
      "reward": 0.4183342158794403,
      "reward_std": 0.06782528012990952,
      "rewards/gemini_judge_reward_func/mean": 0.1227678582072258,
      "rewards/gemini_judge_reward_func/std": 0.22830908000469208,
      "rewards/semantic_correctness_reward_func/mean": 0.4518852233886719,
      "rewards/semantic_correctness_reward_func/std": 0.18604160845279694,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 452.0,
      "completions/mean_length": 150.29464721679688,
      "completions/mean_terminated_length": 142.42343139648438,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.11847552387862234,
      "grad_norm": 0.021145131438970566,
      "kl": 0.017850875854492188,
      "learning_rate": 7.992174994867124e-06,
      "loss": -0.0141,
      "num_tokens": 123812385.0,
      "reward": 0.4235004186630249,
      "reward_std": 0.07467382401227951,
      "rewards/gemini_judge_reward_func/mean": 0.1316964328289032,
      "rewards/gemini_judge_reward_func/std": 0.26034605503082275,
      "rewards/semantic_correctness_reward_func/mean": 0.4553859531879425,
      "rewards/semantic_correctness_reward_func/std": 0.19646060466766357,
      "rewards/xmlcount_reward_func/mean": 0.6993616819381714,
      "rewards/xmlcount_reward_func/std": 0.4591045677661896,
      "step": 347
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 272.0,
      "completions/max_terminated_length": 272.0,
      "completions/mean_length": 137.13394165039062,
      "completions/mean_terminated_length": 137.13394165039062,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.1188169519013273,
      "grad_norm": 0.023312104865908623,
      "kl": 0.018787860870361328,
      "learning_rate": 7.978911531372764e-06,
      "loss": 0.003,
      "num_tokens": 124133127.0,
      "reward": 0.4589785635471344,
      "reward_std": 0.06582622230052948,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.2379140406847,
      "rewards/semantic_correctness_reward_func/mean": 0.4205087721347809,
      "rewards/semantic_correctness_reward_func/std": 0.19802919030189514,
      "rewards/xmlcount_reward_func/mean": 0.8088437914848328,
      "rewards/xmlcount_reward_func/std": 0.3951219618320465,
      "step": 348
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 709.0,
      "completions/mean_length": 148.7366180419922,
      "completions/mean_terminated_length": 140.85134887695312,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.11915837992403226,
      "grad_norm": 0.0231174249202013,
      "kl": 0.025996685028076172,
      "learning_rate": 7.965615490979165e-06,
      "loss": -0.0209,
      "num_tokens": 124506008.0,
      "reward": 0.42070716619491577,
      "reward_std": 0.054260022938251495,
      "rewards/gemini_judge_reward_func/mean": 0.1071428582072258,
      "rewards/gemini_judge_reward_func/std": 0.22137534618377686,
      "rewards/semantic_correctness_reward_func/mean": 0.4056251347064972,
      "rewards/semantic_correctness_reward_func/std": 0.1975453943014145,
      "rewards/xmlcount_reward_func/mean": 0.7418125867843628,
      "rewards/xmlcount_reward_func/std": 0.4394664168357849,
      "step": 349
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 796.0,
      "completions/mean_length": 162.75,
      "completions/mean_terminated_length": 143.0867462158203,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.11949980794673723,
      "grad_norm": 0.019620204344391823,
      "kl": 0.014969110488891602,
      "learning_rate": 7.952287019089686e-06,
      "loss": -0.0175,
      "num_tokens": 124862112.0,
      "reward": 0.4290351867675781,
      "reward_std": 0.059172313660383224,
      "rewards/gemini_judge_reward_func/mean": 0.1149553582072258,
      "rewards/gemini_judge_reward_func/std": 0.21019315719604492,
      "rewards/semantic_correctness_reward_func/mean": 0.3958899676799774,
      "rewards/semantic_correctness_reward_func/std": 0.21406039595603943,
      "rewards/xmlcount_reward_func/mean": 0.7596875429153442,
      "rewards/xmlcount_reward_func/std": 0.4264892339706421,
      "step": 350
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 944.0,
      "completions/mean_length": 180.13394165039062,
      "completions/mean_terminated_length": 156.9082489013672,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.1198412359694422,
      "grad_norm": 0.02199356071650982,
      "kl": 0.016332149505615234,
      "learning_rate": 7.938926261462366e-06,
      "loss": 0.0142,
      "num_tokens": 125222638.0,
      "reward": 0.39731401205062866,
      "reward_std": 0.04134167358279228,
      "rewards/gemini_judge_reward_func/mean": 0.0758928582072258,
      "rewards/gemini_judge_reward_func/std": 0.18289919197559357,
      "rewards/semantic_correctness_reward_func/mean": 0.40478435158729553,
      "rewards/semantic_correctness_reward_func/std": 0.18352636694908142,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 351
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 279.0,
      "completions/mean_length": 149.4375,
      "completions/mean_terminated_length": 141.55856323242188,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.12018266399214715,
      "grad_norm": 0.023317914456129074,
      "kl": 0.017638683319091797,
      "learning_rate": 7.925533364208308e-06,
      "loss": 0.0142,
      "num_tokens": 125537380.0,
      "reward": 0.4412147104740143,
      "reward_std": 0.07194562256336212,
      "rewards/gemini_judge_reward_func/mean": 0.1171875,
      "rewards/gemini_judge_reward_func/std": 0.21941961348056793,
      "rewards/semantic_correctness_reward_func/mean": 0.4344485104084015,
      "rewards/semantic_correctness_reward_func/std": 0.2174639254808426,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 157.3303680419922,
      "completions/mean_terminated_length": 153.44395446777344,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.12052409201485212,
      "grad_norm": 0.02054346166551113,
      "kl": 0.013754844665527344,
      "learning_rate": 7.912108473790092e-06,
      "loss": 0.0265,
      "num_tokens": 125890782.0,
      "reward": 0.4809158742427826,
      "reward_std": 0.07998733222484589,
      "rewards/gemini_judge_reward_func/mean": 0.1685267835855484,
      "rewards/gemini_judge_reward_func/std": 0.26428356766700745,
      "rewards/semantic_correctness_reward_func/mean": 0.4587755799293518,
      "rewards/semantic_correctness_reward_func/std": 0.20430655777454376,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 151.66964721679688,
      "completions/mean_terminated_length": 147.7578582763672,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.12086552003755709,
      "grad_norm": 0.019831150770187378,
      "kl": 0.013668537139892578,
      "learning_rate": 7.898651737020166e-06,
      "loss": -0.0238,
      "num_tokens": 126230728.0,
      "reward": 0.4762882590293884,
      "reward_std": 0.07177340984344482,
      "rewards/gemini_judge_reward_func/mean": 0.1395089328289032,
      "rewards/gemini_judge_reward_func/std": 0.24901461601257324,
      "rewards/semantic_correctness_reward_func/mean": 0.45792320370674133,
      "rewards/semantic_correctness_reward_func/std": 0.21523572504520416,
      "rewards/xmlcount_reward_func/mean": 0.8222500681877136,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 354
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 806.0,
      "completions/mean_length": 184.16964721679688,
      "completions/mean_terminated_length": 153.0648193359375,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.12120694806026204,
      "grad_norm": 0.02033446729183197,
      "kl": 0.013987541198730469,
      "learning_rate": 7.885163301059251e-06,
      "loss": 0.0056,
      "num_tokens": 126579194.0,
      "reward": 0.4241471588611603,
      "reward_std": 0.045106999576091766,
      "rewards/gemini_judge_reward_func/mean": 0.0814732164144516,
      "rewards/gemini_judge_reward_func/std": 0.2058626413345337,
      "rewards/semantic_correctness_reward_func/mean": 0.3758426010608673,
      "rewards/semantic_correctness_reward_func/std": 0.1890416443347931,
      "rewards/xmlcount_reward_func/mean": 0.7909732460975647,
      "rewards/xmlcount_reward_func/std": 0.4057386815547943,
      "step": 355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 663.0,
      "completions/mean_length": 148.33482360839844,
      "completions/mean_terminated_length": 140.44595336914062,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.12154837608296701,
      "grad_norm": 0.022476162761449814,
      "kl": 0.015032291412353516,
      "learning_rate": 7.871643313414718e-06,
      "loss": -0.0142,
      "num_tokens": 126935869.0,
      "reward": 0.4261614680290222,
      "reward_std": 0.0577315129339695,
      "rewards/gemini_judge_reward_func/mean": 0.1026785746216774,
      "rewards/gemini_judge_reward_func/std": 0.22223277390003204,
      "rewards/semantic_correctness_reward_func/mean": 0.43737873435020447,
      "rewards/semantic_correctness_reward_func/std": 0.19673730432987213,
      "rewards/xmlcount_reward_func/mean": 0.7440357208251953,
      "rewards/xmlcount_reward_func/std": 0.43435025215148926,
      "step": 356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 256.0,
      "completions/mean_length": 148.95089721679688,
      "completions/mean_terminated_length": 141.06756591796875,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.12188980410567198,
      "grad_norm": 0.023637736216187477,
      "kl": 0.016416549682617188,
      "learning_rate": 7.858091921938989e-06,
      "loss": -0.0125,
      "num_tokens": 127282282.0,
      "reward": 0.4587040841579437,
      "reward_std": 0.07693413645029068,
      "rewards/gemini_judge_reward_func/mean": 0.1763392835855484,
      "rewards/gemini_judge_reward_func/std": 0.3062107563018799,
      "rewards/semantic_correctness_reward_func/mean": 0.47061866521835327,
      "rewards/semantic_correctness_reward_func/std": 0.21844151616096497,
      "rewards/xmlcount_reward_func/mean": 0.735111653804779,
      "rewards/xmlcount_reward_func/std": 0.4418267011642456,
      "step": 357
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 370.0,
      "completions/mean_length": 151.4553680419922,
      "completions/mean_terminated_length": 143.5946044921875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.12223123212837694,
      "grad_norm": 0.023121589794754982,
      "kl": 0.017974853515625,
      "learning_rate": 7.844509274827907e-06,
      "loss": 0.0005,
      "num_tokens": 127638940.0,
      "reward": 0.41608354449272156,
      "reward_std": 0.05893407762050629,
      "rewards/gemini_judge_reward_func/mean": 0.0926339253783226,
      "rewards/gemini_judge_reward_func/std": 0.2171497493982315,
      "rewards/semantic_correctness_reward_func/mean": 0.4293998181819916,
      "rewards/semantic_correctness_reward_func/std": 0.20812109112739563,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 358
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 481.0,
      "completions/mean_length": 159.6428680419922,
      "completions/mean_terminated_length": 147.90951538085938,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.1225726601510819,
      "grad_norm": 0.02072990871965885,
      "kl": 0.015190601348876953,
      "learning_rate": 7.830895520619129e-06,
      "loss": -0.0122,
      "num_tokens": 127996884.0,
      "reward": 0.4240740239620209,
      "reward_std": 0.06780924648046494,
      "rewards/gemini_judge_reward_func/mean": 0.1071428582072258,
      "rewards/gemini_judge_reward_func/std": 0.2175440937280655,
      "rewards/semantic_correctness_reward_func/mean": 0.40458425879478455,
      "rewards/semantic_correctness_reward_func/std": 0.21169018745422363,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 359
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 818.0,
      "completions/mean_length": 158.33929443359375,
      "completions/mean_terminated_length": 142.59999084472656,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.12291408817378686,
      "grad_norm": 0.02120456099510193,
      "kl": 0.01677846908569336,
      "learning_rate": 7.817250808190483e-06,
      "loss": -0.0013,
      "num_tokens": 128351412.0,
      "reward": 0.419956237077713,
      "reward_std": 0.06337090581655502,
      "rewards/gemini_judge_reward_func/mean": 0.1127232164144516,
      "rewards/gemini_judge_reward_func/std": 0.24459390342235565,
      "rewards/semantic_correctness_reward_func/mean": 0.4085846543312073,
      "rewards/semantic_correctness_reward_func/std": 0.20613060891628265,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 360
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 298.0,
      "completions/mean_length": 169.60714721679688,
      "completions/mean_terminated_length": 137.9629669189453,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.12325551619649183,
      "grad_norm": 0.022048698738217354,
      "kl": 0.018438339233398438,
      "learning_rate": 7.803575286758365e-06,
      "loss": -0.0032,
      "num_tokens": 128731148.0,
      "reward": 0.41158369183540344,
      "reward_std": 0.06956712901592255,
      "rewards/gemini_judge_reward_func/mean": 0.1439732164144516,
      "rewards/gemini_judge_reward_func/std": 0.2618865966796875,
      "rewards/semantic_correctness_reward_func/mean": 0.4472218155860901,
      "rewards/semantic_correctness_reward_func/std": 0.21267808973789215,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 361
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 906.0,
      "completions/mean_length": 168.65625,
      "completions/mean_terminated_length": 153.1045379638672,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.12359694421919679,
      "grad_norm": 0.02125149965286255,
      "kl": 0.016147613525390625,
      "learning_rate": 7.789869105876083e-06,
      "loss": -0.0622,
      "num_tokens": 129093431.0,
      "reward": 0.43265652656555176,
      "reward_std": 0.07214810699224472,
      "rewards/gemini_judge_reward_func/mean": 0.1395089328289032,
      "rewards/gemini_judge_reward_func/std": 0.2621731162071228,
      "rewards/semantic_correctness_reward_func/mean": 0.4043269753456116,
      "rewards/semantic_correctness_reward_func/std": 0.2230450063943863,
      "rewards/xmlcount_reward_func/mean": 0.7399688363075256,
      "rewards/xmlcount_reward_func/std": 0.43580862879753113,
      "step": 362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 158.22769165039062,
      "completions/mean_terminated_length": 146.47511291503906,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.12393837224190175,
      "grad_norm": 0.021313535049557686,
      "kl": 0.01935601234436035,
      "learning_rate": 7.776132415432234e-06,
      "loss": -0.0092,
      "num_tokens": 129453758.0,
      "reward": 0.4301838278770447,
      "reward_std": 0.0693366751074791,
      "rewards/gemini_judge_reward_func/mean": 0.1886160671710968,
      "rewards/gemini_judge_reward_func/std": 0.30937135219573975,
      "rewards/semantic_correctness_reward_func/mean": 0.4406421482563019,
      "rewards/semantic_correctness_reward_func/std": 0.23391857743263245,
      "rewards/xmlcount_reward_func/mean": 0.6665223836898804,
      "rewards/xmlcount_reward_func/std": 0.46999096870422363,
      "step": 363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 913.0,
      "completions/mean_length": 158.55357360839844,
      "completions/mean_terminated_length": 134.7339324951172,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.12427980026460672,
      "grad_norm": 0.02203877456486225,
      "kl": 0.02167510986328125,
      "learning_rate": 7.762365365649068e-06,
      "loss": -0.0115,
      "num_tokens": 129829190.0,
      "reward": 0.3783107399940491,
      "reward_std": 0.055117614567279816,
      "rewards/gemini_judge_reward_func/mean": 0.0959821417927742,
      "rewards/gemini_judge_reward_func/std": 0.22271645069122314,
      "rewards/semantic_correctness_reward_func/mean": 0.44833922386169434,
      "rewards/semantic_correctness_reward_func/std": 0.17805521190166473,
      "rewards/xmlcount_reward_func/mean": 0.6256250143051147,
      "rewards/xmlcount_reward_func/std": 0.48569241166114807,
      "step": 364
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 361.0,
      "completions/mean_length": 154.47769165039062,
      "completions/mean_terminated_length": 150.57847595214844,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.12462122828731168,
      "grad_norm": 0.021525070071220398,
      "kl": 0.015169620513916016,
      "learning_rate": 7.748568107080831e-06,
      "loss": -0.015,
      "num_tokens": 130180113.0,
      "reward": 0.4638497829437256,
      "reward_std": 0.05021931231021881,
      "rewards/gemini_judge_reward_func/mean": 0.1004464253783226,
      "rewards/gemini_judge_reward_func/std": 0.22575192153453827,
      "rewards/semantic_correctness_reward_func/mean": 0.40235573053359985,
      "rewards/semantic_correctness_reward_func/std": 0.215322345495224,
      "rewards/xmlcount_reward_func/mean": 0.8580000996589661,
      "rewards/xmlcount_reward_func/std": 0.35106155276298523,
      "step": 365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 410.0,
      "completions/mean_length": 161.45089721679688,
      "completions/mean_terminated_length": 141.7579803466797,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.12496265631001664,
      "grad_norm": 0.020651815459132195,
      "kl": 0.01579141616821289,
      "learning_rate": 7.734740790612137e-06,
      "loss": -0.0091,
      "num_tokens": 130554842.0,
      "reward": 0.37947192788124084,
      "reward_std": 0.05589864403009415,
      "rewards/gemini_judge_reward_func/mean": 0.0758928582072258,
      "rewards/gemini_judge_reward_func/std": 0.1798083484172821,
      "rewards/semantic_correctness_reward_func/mean": 0.3870737552642822,
      "rewards/semantic_correctness_reward_func/std": 0.18478237092494965,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 366
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 907.0,
      "completions/mean_length": 158.6116180419922,
      "completions/mean_terminated_length": 154.73095703125,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.1253040843327216,
      "grad_norm": 0.019816717132925987,
      "kl": 0.016491413116455078,
      "learning_rate": 7.720883567456299e-06,
      "loss": -0.0226,
      "num_tokens": 130900519.0,
      "reward": 0.43501195311546326,
      "reward_std": 0.06687616556882858,
      "rewards/gemini_judge_reward_func/mean": 0.1383928507566452,
      "rewards/gemini_judge_reward_func/std": 0.24338625371456146,
      "rewards/semantic_correctness_reward_func/mean": 0.43252378702163696,
      "rewards/semantic_correctness_reward_func/std": 0.20659232139587402,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 367
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 398.0,
      "completions/mean_length": 151.45982360839844,
      "completions/mean_terminated_length": 143.59910583496094,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.12564551235542656,
      "grad_norm": 0.02021671086549759,
      "kl": 0.017315387725830078,
      "learning_rate": 7.70699658915369e-06,
      "loss": -0.0132,
      "num_tokens": 131242978.0,
      "reward": 0.45236364006996155,
      "reward_std": 0.07268624007701874,
      "rewards/gemini_judge_reward_func/mean": 0.1685267835855484,
      "rewards/gemini_judge_reward_func/std": 0.292473703622818,
      "rewards/semantic_correctness_reward_func/mean": 0.4635234475135803,
      "rewards/semantic_correctness_reward_func/std": 0.20471033453941345,
      "rewards/xmlcount_reward_func/mean": 0.7306205630302429,
      "rewards/xmlcount_reward_func/std": 0.4441879093647003,
      "step": 368
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 167.49554443359375,
      "completions/mean_terminated_length": 151.9227294921875,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.12598694037813155,
      "grad_norm": 0.023807330057024956,
      "kl": 0.01745128631591797,
      "learning_rate": 7.693080007570084e-06,
      "loss": 0.0224,
      "num_tokens": 131625237.0,
      "reward": 0.4182495176792145,
      "reward_std": 0.07036899775266647,
      "rewards/gemini_judge_reward_func/mean": 0.1551339328289032,
      "rewards/gemini_judge_reward_func/std": 0.26403728127479553,
      "rewards/semantic_correctness_reward_func/mean": 0.4224795699119568,
      "rewards/semantic_correctness_reward_func/std": 0.2183416783809662,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 369
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 318.0,
      "completions/mean_length": 148.90179443359375,
      "completions/mean_terminated_length": 141.0180206298828,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.1263283684008365,
      "grad_norm": 0.02168549969792366,
      "kl": 0.016880512237548828,
      "learning_rate": 7.679133974894984e-06,
      "loss": -0.0272,
      "num_tokens": 131959631.0,
      "reward": 0.42804205417633057,
      "reward_std": 0.05581650137901306,
      "rewards/gemini_judge_reward_func/mean": 0.1104910746216774,
      "rewards/gemini_judge_reward_func/std": 0.24675332009792328,
      "rewards/semantic_correctness_reward_func/mean": 0.4177280068397522,
      "rewards/semantic_correctness_reward_func/std": 0.20709756016731262,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 370
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 773.0,
      "completions/mean_length": 156.0357208251953,
      "completions/mean_terminated_length": 148.21621704101562,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.12666979642354145,
      "grad_norm": 0.02178873121738434,
      "kl": 0.01481771469116211,
      "learning_rate": 7.66515864363997e-06,
      "loss": -0.0008,
      "num_tokens": 132312491.0,
      "reward": 0.475494921207428,
      "reward_std": 0.0693972259759903,
      "rewards/gemini_judge_reward_func/mean": 0.1662946492433548,
      "rewards/gemini_judge_reward_func/std": 0.2516138553619385,
      "rewards/semantic_correctness_reward_func/mean": 0.47188514471054077,
      "rewards/semantic_correctness_reward_func/std": 0.2055116444826126,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 371
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 399.0,
      "completions/mean_length": 148.68304443359375,
      "completions/mean_terminated_length": 140.79730224609375,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.12701122444624643,
      "grad_norm": 0.021552162244915962,
      "kl": 0.01751708984375,
      "learning_rate": 7.651154166637025e-06,
      "loss": 0.0188,
      "num_tokens": 132678580.0,
      "reward": 0.4199696183204651,
      "reward_std": 0.059298258274793625,
      "rewards/gemini_judge_reward_func/mean": 0.1171875,
      "rewards/gemini_judge_reward_func/std": 0.23184122145175934,
      "rewards/semantic_correctness_reward_func/mean": 0.44441038370132446,
      "rewards/semantic_correctness_reward_func/std": 0.20885135233402252,
      "rewards/xmlcount_reward_func/mean": 0.7105312943458557,
      "rewards/xmlcount_reward_func/std": 0.4553159773349762,
      "step": 372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 901.0,
      "completions/mean_length": 160.8169708251953,
      "completions/mean_terminated_length": 156.94619750976562,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.1273526524689514,
      "grad_norm": 0.021256279200315475,
      "kl": 0.01796579360961914,
      "learning_rate": 7.637120697036866e-06,
      "loss": 0.0054,
      "num_tokens": 133042543.0,
      "reward": 0.40777140855789185,
      "reward_std": 0.0612851157784462,
      "rewards/gemini_judge_reward_func/mean": 0.1037946417927742,
      "rewards/gemini_judge_reward_func/std": 0.20393303036689758,
      "rewards/semantic_correctness_reward_func/mean": 0.4012675881385803,
      "rewards/semantic_correctness_reward_func/std": 0.19751423597335815,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.0,
      "completions/mean_length": 180.0982208251953,
      "completions/mean_terminated_length": 148.84259033203125,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.12769408049165634,
      "grad_norm": 0.021484747529029846,
      "kl": 0.015058517456054688,
      "learning_rate": 7.62305838830727e-06,
      "loss": -0.019,
      "num_tokens": 133408461.0,
      "reward": 0.3818875849246979,
      "reward_std": 0.05296236649155617,
      "rewards/gemini_judge_reward_func/mean": 0.0870535746216774,
      "rewards/gemini_judge_reward_func/std": 0.20286573469638824,
      "rewards/semantic_correctness_reward_func/mean": 0.3946877419948578,
      "rewards/semantic_correctness_reward_func/std": 0.17419084906578064,
      "rewards/xmlcount_reward_func/mean": 0.6703214645385742,
      "rewards/xmlcount_reward_func/std": 0.4670778214931488,
      "step": 374
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 893.0,
      "completions/mean_length": 148.1116180419922,
      "completions/mean_terminated_length": 144.18386840820312,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.12803550851436132,
      "grad_norm": 0.021499551832675934,
      "kl": 0.018602371215820312,
      "learning_rate": 7.608967394231387e-06,
      "loss": -0.0345,
      "num_tokens": 133775478.0,
      "reward": 0.41469037532806396,
      "reward_std": 0.04859733209013939,
      "rewards/gemini_judge_reward_func/mean": 0.125,
      "rewards/gemini_judge_reward_func/std": 0.2426035851240158,
      "rewards/semantic_correctness_reward_func/mean": 0.42921966314315796,
      "rewards/semantic_correctness_reward_func/std": 0.20824337005615234,
      "rewards/xmlcount_reward_func/mean": 0.6971160769462585,
      "rewards/xmlcount_reward_func/std": 0.46128448843955994,
      "step": 375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 838.0,
      "completions/mean_length": 160.82589721679688,
      "completions/mean_terminated_length": 141.11871337890625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.12837693653706628,
      "grad_norm": 0.021708445623517036,
      "kl": 0.017911672592163086,
      "learning_rate": 7.594847868906076e-06,
      "loss": -0.0095,
      "num_tokens": 134139351.0,
      "reward": 0.42823106050491333,
      "reward_std": 0.06395815312862396,
      "rewards/gemini_judge_reward_func/mean": 0.1361607164144516,
      "rewards/gemini_judge_reward_func/std": 0.26552340388298035,
      "rewards/semantic_correctness_reward_func/mean": 0.43883365392684937,
      "rewards/semantic_correctness_reward_func/std": 0.21176576614379883,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 376
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 890.0,
      "completions/mean_length": 162.5357208251953,
      "completions/mean_terminated_length": 146.8727264404297,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.12871836455977123,
      "grad_norm": 0.020864030346274376,
      "kl": 0.01667308807373047,
      "learning_rate": 7.580699966740201e-06,
      "loss": 0.0403,
      "num_tokens": 134527599.0,
      "reward": 0.4344416558742523,
      "reward_std": 0.07495336979627609,
      "rewards/gemini_judge_reward_func/mean": 0.1808035671710968,
      "rewards/gemini_judge_reward_func/std": 0.2884351909160614,
      "rewards/semantic_correctness_reward_func/mean": 0.4521009027957916,
      "rewards/semantic_correctness_reward_func/std": 0.21390804648399353,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 377
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 757.0,
      "completions/mean_length": 176.68751525878906,
      "completions/mean_terminated_length": 157.3424530029297,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.1290597925824762,
      "grad_norm": 0.01891087181866169,
      "kl": 0.016262054443359375,
      "learning_rate": 7.566523842452958e-06,
      "loss": -0.0123,
      "num_tokens": 134889049.0,
      "reward": 0.4385979175567627,
      "reward_std": 0.0712980329990387,
      "rewards/gemini_judge_reward_func/mean": 0.1216517835855484,
      "rewards/gemini_judge_reward_func/std": 0.2482900321483612,
      "rewards/semantic_correctness_reward_func/mean": 0.44818589091300964,
      "rewards/semantic_correctness_reward_func/std": 0.20277683436870575,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 270.0,
      "completions/mean_length": 155.92857360839844,
      "completions/mean_terminated_length": 144.14480590820312,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.12940122060518117,
      "grad_norm": 0.02066349983215332,
      "kl": 0.013991832733154297,
      "learning_rate": 7.552319651072164e-06,
      "loss": -0.0142,
      "num_tokens": 135215325.0,
      "reward": 0.4548727571964264,
      "reward_std": 0.058319687843322754,
      "rewards/gemini_judge_reward_func/mean": 0.1037946417927742,
      "rewards/gemini_judge_reward_func/std": 0.22107842564582825,
      "rewards/semantic_correctness_reward_func/mean": 0.4222742021083832,
      "rewards/semantic_correctness_reward_func/std": 0.19840273261070251,
      "rewards/xmlcount_reward_func/mean": 0.8222500681877136,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 379
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 747.0,
      "completions/mean_length": 163.63394165039062,
      "completions/mean_terminated_length": 147.99090576171875,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.12974264862788612,
      "grad_norm": 0.019953692331910133,
      "kl": 0.016832828521728516,
      "learning_rate": 7.5380875479325855e-06,
      "loss": -0.0387,
      "num_tokens": 135580519.0,
      "reward": 0.47378772497177124,
      "reward_std": 0.06329935044050217,
      "rewards/gemini_judge_reward_func/mean": 0.1830357164144516,
      "rewards/gemini_judge_reward_func/std": 0.2889639437198639,
      "rewards/semantic_correctness_reward_func/mean": 0.4656168818473816,
      "rewards/semantic_correctness_reward_func/std": 0.25280624628067017,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 380
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 562.0,
      "completions/mean_length": 156.5669708251953,
      "completions/mean_terminated_length": 152.67713928222656,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.1300840766505911,
      "grad_norm": 0.01964419148862362,
      "kl": 0.017740726470947266,
      "learning_rate": 7.52382768867422e-06,
      "loss": 0.007,
      "num_tokens": 135927274.0,
      "reward": 0.4654104709625244,
      "reward_std": 0.05828892067074776,
      "rewards/gemini_judge_reward_func/mean": 0.1149553582072258,
      "rewards/gemini_judge_reward_func/std": 0.219328373670578,
      "rewards/semantic_correctness_reward_func/mean": 0.4302842915058136,
      "rewards/semantic_correctness_reward_func/std": 0.21055085957050323,
      "rewards/xmlcount_reward_func/mean": 0.833428680896759,
      "rewards/xmlcount_reward_func/std": 0.37002047896385193,
      "step": 381
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 305.0,
      "completions/mean_length": 148.58929443359375,
      "completions/mean_terminated_length": 136.7058868408203,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.13042550467329606,
      "grad_norm": 0.019918564707040787,
      "kl": 0.01675271987915039,
      "learning_rate": 7.509540229240601e-06,
      "loss": -0.0218,
      "num_tokens": 136290766.0,
      "reward": 0.3735648989677429,
      "reward_std": 0.05948694050312042,
      "rewards/gemini_judge_reward_func/mean": 0.1060267835855484,
      "rewards/gemini_judge_reward_func/std": 0.21485590934753418,
      "rewards/semantic_correctness_reward_func/mean": 0.44027090072631836,
      "rewards/semantic_correctness_reward_func/std": 0.20193372666835785,
      "rewards/xmlcount_reward_func/mean": 0.6077500581741333,
      "rewards/xmlcount_reward_func/std": 0.48996880650520325,
      "step": 382
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 407.0,
      "completions/mean_length": 151.4241180419922,
      "completions/mean_terminated_length": 143.5630645751953,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.13076693269600104,
      "grad_norm": 0.02249898388981819,
      "kl": 0.019742965698242188,
      "learning_rate": 7.4952253258771036e-06,
      "loss": 0.0399,
      "num_tokens": 136652045.0,
      "reward": 0.436825156211853,
      "reward_std": 0.056539103388786316,
      "rewards/gemini_judge_reward_func/mean": 0.1685267835855484,
      "rewards/gemini_judge_reward_func/std": 0.29533451795578003,
      "rewards/semantic_correctness_reward_func/mean": 0.4528220593929291,
      "rewards/semantic_correctness_reward_func/std": 0.21148590743541718,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 161.2857208251953,
      "completions/mean_terminated_length": 153.51351928710938,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.131108360718706,
      "grad_norm": 0.02023939974606037,
      "kl": 0.015448570251464844,
      "learning_rate": 7.480883135129211e-06,
      "loss": -0.0074,
      "num_tokens": 137023725.0,
      "reward": 0.4137805700302124,
      "reward_std": 0.05417332798242569,
      "rewards/gemini_judge_reward_func/mean": 0.0926339253783226,
      "rewards/gemini_judge_reward_func/std": 0.18509900569915771,
      "rewards/semantic_correctness_reward_func/mean": 0.41341152787208557,
      "rewards/semantic_correctness_reward_func/std": 0.19928321242332458,
      "rewards/xmlcount_reward_func/mean": 0.735111653804779,
      "rewards/xmlcount_reward_func/std": 0.4418267011642456,
      "step": 384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 731.0,
      "completions/mean_length": 145.07589721679688,
      "completions/mean_terminated_length": 141.13453674316406,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.13144978874141094,
      "grad_norm": 0.022299103438854218,
      "kl": 0.018901348114013672,
      "learning_rate": 7.4665138138408255e-06,
      "loss": 0.0139,
      "num_tokens": 137355434.0,
      "reward": 0.4558815360069275,
      "reward_std": 0.07524207979440689,
      "rewards/gemini_judge_reward_func/mean": 0.1462053507566452,
      "rewards/gemini_judge_reward_func/std": 0.26490774750709534,
      "rewards/semantic_correctness_reward_func/mean": 0.45868438482284546,
      "rewards/semantic_correctness_reward_func/std": 0.23558257520198822,
      "rewards/xmlcount_reward_func/mean": 0.7641563415527344,
      "rewards/xmlcount_reward_func/std": 0.4263768792152405,
      "step": 385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 655.0,
      "completions/mean_length": 160.26339721679688,
      "completions/mean_terminated_length": 148.53846740722656,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.13179121676411593,
      "grad_norm": 0.022259045392274857,
      "kl": 0.017290592193603516,
      "learning_rate": 7.452117519152542e-06,
      "loss": 0.0129,
      "num_tokens": 137707349.0,
      "reward": 0.4224461615085602,
      "reward_std": 0.05426723137497902,
      "rewards/gemini_judge_reward_func/mean": 0.0892857164144516,
      "rewards/gemini_judge_reward_func/std": 0.1782989650964737,
      "rewards/semantic_correctness_reward_func/mean": 0.39640921354293823,
      "rewards/semantic_correctness_reward_func/std": 0.18904563784599304,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 386
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 305.0,
      "completions/mean_length": 156.45982360839844,
      "completions/mean_terminated_length": 144.68325805664062,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.13213264478682088,
      "grad_norm": 0.02163863182067871,
      "kl": 0.01602315902709961,
      "learning_rate": 7.437694408499932e-06,
      "loss": 0.0092,
      "num_tokens": 138051268.0,
      "reward": 0.43781083822250366,
      "reward_std": 0.05744494870305061,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.23313553631305695,
      "rewards/semantic_correctness_reward_func/mean": 0.4464828670024872,
      "rewards/semantic_correctness_reward_func/std": 0.19655469059944153,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 387
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 397.0,
      "completions/mean_length": 152.40179443359375,
      "completions/mean_terminated_length": 144.549560546875,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.13247407280952583,
      "grad_norm": 0.022210579365491867,
      "kl": 0.017316818237304688,
      "learning_rate": 7.4232446396118265e-06,
      "loss": -0.0113,
      "num_tokens": 138391914.0,
      "reward": 0.4606318771839142,
      "reward_std": 0.08661013096570969,
      "rewards/gemini_judge_reward_func/mean": 0.1529017835855484,
      "rewards/gemini_judge_reward_func/std": 0.2621540129184723,
      "rewards/semantic_correctness_reward_func/mean": 0.42435577511787415,
      "rewards/semantic_correctness_reward_func/std": 0.20760102570056915,
      "rewards/xmlcount_reward_func/mean": 0.786500096321106,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 388
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 355.0,
      "completions/mean_length": 149.21875,
      "completions/mean_terminated_length": 145.2959747314453,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.13281550083223082,
      "grad_norm": 0.021780794486403465,
      "kl": 0.016431808471679688,
      "learning_rate": 7.408768370508577e-06,
      "loss": -0.02,
      "num_tokens": 138750395.0,
      "reward": 0.42754751443862915,
      "reward_std": 0.05521192029118538,
      "rewards/gemini_judge_reward_func/mean": 0.0915178582072258,
      "rewards/gemini_judge_reward_func/std": 0.1980723738670349,
      "rewards/semantic_correctness_reward_func/mean": 0.4174516499042511,
      "rewards/semantic_correctness_reward_func/std": 0.20846490561962128,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 389
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 600.0,
      "completions/mean_length": 161.19644165039062,
      "completions/mean_terminated_length": 145.5090789794922,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.13315692885493577,
      "grad_norm": 0.021414414048194885,
      "kl": 0.018945693969726562,
      "learning_rate": 7.394265759500348e-06,
      "loss": -0.0034,
      "num_tokens": 139114735.0,
      "reward": 0.41808784008026123,
      "reward_std": 0.06346622854471207,
      "rewards/gemini_judge_reward_func/mean": 0.1194196417927742,
      "rewards/gemini_judge_reward_func/std": 0.2319059669971466,
      "rewards/semantic_correctness_reward_func/mean": 0.4215996563434601,
      "rewards/semantic_correctness_reward_func/std": 0.21271347999572754,
      "rewards/xmlcount_reward_func/mean": 0.7150000929832458,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 390
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 822.0,
      "completions/mean_length": 167.0625,
      "completions/mean_terminated_length": 155.42987060546875,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.13349835687764072,
      "grad_norm": 0.02051488868892193,
      "kl": 0.016121387481689453,
      "learning_rate": 7.379736965185369e-06,
      "loss": -0.0242,
      "num_tokens": 139485993.0,
      "reward": 0.44687801599502563,
      "reward_std": 0.055957481265068054,
      "rewards/gemini_judge_reward_func/mean": 0.0926339253783226,
      "rewards/gemini_judge_reward_func/std": 0.18204548954963684,
      "rewards/semantic_correctness_reward_func/mean": 0.4403719902038574,
      "rewards/semantic_correctness_reward_func/std": 0.19233807921409607,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 391
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 641.0,
      "completions/mean_length": 150.4375,
      "completions/mean_terminated_length": 142.56756591796875,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.1338397849003457,
      "grad_norm": 0.025922337546944618,
      "kl": 0.018519878387451172,
      "learning_rate": 7.365182146448205e-06,
      "loss": -0.0162,
      "num_tokens": 139818011.0,
      "reward": 0.4373648762702942,
      "reward_std": 0.05904890224337578,
      "rewards/gemini_judge_reward_func/mean": 0.1316964328289032,
      "rewards/gemini_judge_reward_func/std": 0.23428121209144592,
      "rewards/semantic_correctness_reward_func/mean": 0.4219311773777008,
      "rewards/semantic_correctness_reward_func/std": 0.224158376455307,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 392
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 270.0,
      "completions/mean_length": 146.89732360839844,
      "completions/mean_terminated_length": 142.96412658691406,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.13418121292305066,
      "grad_norm": 0.021265504881739616,
      "kl": 0.016716480255126953,
      "learning_rate": 7.350601462458025e-06,
      "loss": -0.0039,
      "num_tokens": 140184364.0,
      "reward": 0.4281903803348541,
      "reward_std": 0.0652979239821434,
      "rewards/gemini_judge_reward_func/mean": 0.1138392835855484,
      "rewards/gemini_judge_reward_func/std": 0.2514272928237915,
      "rewards/semantic_correctness_reward_func/mean": 0.4117732644081116,
      "rewards/semantic_correctness_reward_func/std": 0.1989647001028061,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 393
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 622.0,
      "completions/mean_length": 159.44644165039062,
      "completions/mean_terminated_length": 143.72726440429688,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.1345226409457556,
      "grad_norm": 0.022965986281633377,
      "kl": 0.019566059112548828,
      "learning_rate": 7.335995072666848e-06,
      "loss": -0.0162,
      "num_tokens": 140515804.0,
      "reward": 0.4194537103176117,
      "reward_std": 0.04903746023774147,
      "rewards/gemini_judge_reward_func/mean": 0.109375,
      "rewards/gemini_judge_reward_func/std": 0.20716224610805511,
      "rewards/semantic_correctness_reward_func/mean": 0.41276854276657104,
      "rewards/semantic_correctness_reward_func/std": 0.18567071855068207,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 161.39732360839844,
      "completions/mean_terminated_length": 141.70318603515625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.1348640689684606,
      "grad_norm": 0.02179262414574623,
      "kl": 0.02160930633544922,
      "learning_rate": 7.3213631368078196e-06,
      "loss": -0.0011,
      "num_tokens": 140892281.0,
      "reward": 0.38270094990730286,
      "reward_std": 0.05515586584806442,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.26785045862197876,
      "rewards/semantic_correctness_reward_func/mean": 0.43906697630882263,
      "rewards/semantic_correctness_reward_func/std": 0.21814100444316864,
      "rewards/xmlcount_reward_func/mean": 0.6099866628646851,
      "rewards/xmlcount_reward_func/std": 0.4883228540420532,
      "step": 395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 368.0,
      "completions/mean_length": 168.22769165039062,
      "completions/mean_terminated_length": 148.6894989013672,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.13520549699116555,
      "grad_norm": 0.021188482642173767,
      "kl": 0.012853145599365234,
      "learning_rate": 7.30670581489344e-06,
      "loss": -0.0209,
      "num_tokens": 141243504.0,
      "reward": 0.4657913148403168,
      "reward_std": 0.07301204651594162,
      "rewards/gemini_judge_reward_func/mean": 0.1551339328289032,
      "rewards/gemini_judge_reward_func/std": 0.3035410940647125,
      "rewards/semantic_correctness_reward_func/mean": 0.4099385440349579,
      "rewards/semantic_correctness_reward_func/std": 0.24554020166397095,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 396
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 812.0,
      "completions/mean_length": 149.60269165039062,
      "completions/mean_terminated_length": 145.68162536621094,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.1355469250138705,
      "grad_norm": 0.021987369284033775,
      "kl": 0.018671512603759766,
      "learning_rate": 7.292023267213836e-06,
      "loss": -0.0344,
      "num_tokens": 141591435.0,
      "reward": 0.452779084444046,
      "reward_std": 0.06523489207029343,
      "rewards/gemini_judge_reward_func/mean": 0.1361607164144516,
      "rewards/gemini_judge_reward_func/std": 0.2388509213924408,
      "rewards/semantic_correctness_reward_func/mean": 0.45432382822036743,
      "rewards/semantic_correctness_reward_func/std": 0.19478566944599152,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 395.0,
      "completions/mean_length": 152.66519165039062,
      "completions/mean_terminated_length": 144.81532287597656,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.13588835303657548,
      "grad_norm": 0.02154485322535038,
      "kl": 0.01664876937866211,
      "learning_rate": 7.2773156543349965e-06,
      "loss": 0.0017,
      "num_tokens": 141945044.0,
      "reward": 0.4172542691230774,
      "reward_std": 0.07213146984577179,
      "rewards/gemini_judge_reward_func/mean": 0.1238839253783226,
      "rewards/gemini_judge_reward_func/std": 0.2615041136741638,
      "rewards/semantic_correctness_reward_func/mean": 0.4085032045841217,
      "rewards/semantic_correctness_reward_func/std": 0.20544031262397766,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 555.0,
      "completions/mean_length": 151.21429443359375,
      "completions/mean_terminated_length": 139.36651611328125,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.13622978105928044,
      "grad_norm": 0.020291056483983994,
      "kl": 0.018874645233154297,
      "learning_rate": 7.262583137097019e-06,
      "loss": 0.0093,
      "num_tokens": 142311220.0,
      "reward": 0.44641584157943726,
      "reward_std": 0.0624687597155571,
      "rewards/gemini_judge_reward_func/mean": 0.140625,
      "rewards/gemini_judge_reward_func/std": 0.2599613070487976,
      "rewards/semantic_correctness_reward_func/mean": 0.4493291676044464,
      "rewards/semantic_correctness_reward_func/std": 0.2035626322031021,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0401785714285714,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 663.0,
      "completions/mean_length": 174.49554443359375,
      "completions/mean_terminated_length": 138.9348907470703,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.13657120908198542,
      "grad_norm": 0.02120777778327465,
      "kl": 0.021905899047851562,
      "learning_rate": 7.247825876612353e-06,
      "loss": -0.0367,
      "num_tokens": 142682655.0,
      "reward": 0.38896819949150085,
      "reward_std": 0.06701021641492844,
      "rewards/gemini_judge_reward_func/mean": 0.140625,
      "rewards/gemini_judge_reward_func/std": 0.257796049118042,
      "rewards/semantic_correctness_reward_func/mean": 0.45702823996543884,
      "rewards/semantic_correctness_reward_func/std": 0.19833700358867645,
      "rewards/xmlcount_reward_func/mean": 0.6032813191413879,
      "rewards/xmlcount_reward_func/std": 0.4909299612045288,
      "step": 400
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 874.0,
      "completions/mean_length": 171.41964721679688,
      "completions/mean_terminated_length": 159.84616088867188,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.13691263710469037,
      "grad_norm": 0.022050578147172928,
      "kl": 0.025957345962524414,
      "learning_rate": 7.233044034264034e-06,
      "loss": 0.0136,
      "num_tokens": 143029509.0,
      "reward": 0.4459562301635742,
      "reward_std": 0.06850647926330566,
      "rewards/gemini_judge_reward_func/mean": 0.1417410671710968,
      "rewards/gemini_judge_reward_func/std": 0.24202269315719604,
      "rewards/semantic_correctness_reward_func/mean": 0.44479867815971375,
      "rewards/semantic_correctness_reward_func/std": 0.20681588351726532,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 401
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 608.0,
      "completions/mean_length": 159.51339721679688,
      "completions/mean_terminated_length": 139.7762451171875,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.13725406512739532,
      "grad_norm": 0.02276882901787758,
      "kl": 0.028842449188232422,
      "learning_rate": 7.218237771703921e-06,
      "loss": -0.0022,
      "num_tokens": 143394860.0,
      "reward": 0.38758131861686707,
      "reward_std": 0.05766326189041138,
      "rewards/gemini_judge_reward_func/mean": 0.1171875,
      "rewards/gemini_judge_reward_func/std": 0.21161696314811707,
      "rewards/semantic_correctness_reward_func/mean": 0.4165315628051758,
      "rewards/semantic_correctness_reward_func/std": 0.19921529293060303,
      "rewards/xmlcount_reward_func/mean": 0.643500030040741,
      "rewards/xmlcount_reward_func/std": 0.48071083426475525,
      "step": 402
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 457.0,
      "completions/mean_length": 155.125,
      "completions/mean_terminated_length": 147.29730224609375,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.1375954931501003,
      "grad_norm": 0.02148452028632164,
      "kl": 0.01662302017211914,
      "learning_rate": 7.203407250850929e-06,
      "loss": 0.0048,
      "num_tokens": 143717172.0,
      "reward": 0.4796033799648285,
      "reward_std": 0.08433418720960617,
      "rewards/gemini_judge_reward_func/mean": 0.1674107164144516,
      "rewards/gemini_judge_reward_func/std": 0.26340386271476746,
      "rewards/semantic_correctness_reward_func/mean": 0.4901953339576721,
      "rewards/semantic_correctness_reward_func/std": 0.21127957105636597,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 403
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 349.0,
      "completions/mean_length": 148.8303680419922,
      "completions/mean_terminated_length": 140.94595336914062,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.13793692117280526,
      "grad_norm": 0.020280931144952774,
      "kl": 0.016697406768798828,
      "learning_rate": 7.18855263388926e-06,
      "loss": 0.0017,
      "num_tokens": 144081082.0,
      "reward": 0.449037104845047,
      "reward_std": 0.06599867343902588,
      "rewards/gemini_judge_reward_func/mean": 0.1629464328289032,
      "rewards/gemini_judge_reward_func/std": 0.25325196981430054,
      "rewards/semantic_correctness_reward_func/mean": 0.4557745158672333,
      "rewards/semantic_correctness_reward_func/std": 0.19886869192123413,
      "rewards/xmlcount_reward_func/mean": 0.7317589521408081,
      "rewards/xmlcount_reward_func/std": 0.4439156949520111,
      "step": 404
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 165.49107360839844,
      "completions/mean_terminated_length": 145.89041137695312,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.13827834919551021,
      "grad_norm": 0.020092271268367767,
      "kl": 0.015337467193603516,
      "learning_rate": 7.173674083266624e-06,
      "loss": -0.024,
      "num_tokens": 144430580.0,
      "reward": 0.4049227833747864,
      "reward_std": 0.05651836097240448,
      "rewards/gemini_judge_reward_func/mean": 0.1138392835855484,
      "rewards/gemini_judge_reward_func/std": 0.2141004502773285,
      "rewards/semantic_correctness_reward_func/mean": 0.4339620769023895,
      "rewards/semantic_correctness_reward_func/std": 0.19319763779640198,
      "rewards/xmlcount_reward_func/mean": 0.6814866662025452,
      "rewards/xmlcount_reward_func/std": 0.46647319197654724,
      "step": 405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 746.0,
      "completions/mean_length": 163.99107360839844,
      "completions/mean_terminated_length": 152.31674194335938,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.1386197772182152,
      "grad_norm": 0.021386247128248215,
      "kl": 0.019238710403442383,
      "learning_rate": 7.158771761692464e-06,
      "loss": -0.0153,
      "num_tokens": 144785990.0,
      "reward": 0.42315545678138733,
      "reward_std": 0.058707475662231445,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.25714486837387085,
      "rewards/semantic_correctness_reward_func/mean": 0.4447057247161865,
      "rewards/semantic_correctness_reward_func/std": 0.2132960706949234,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 786.0,
      "completions/mean_length": 158.23214721679688,
      "completions/mean_terminated_length": 146.47964477539062,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.13896120524092015,
      "grad_norm": 0.020573705434799194,
      "kl": 0.018062591552734375,
      "learning_rate": 7.143845832136188e-06,
      "loss": -0.0133,
      "num_tokens": 145152102.0,
      "reward": 0.4384137988090515,
      "reward_std": 0.07652968168258667,
      "rewards/gemini_judge_reward_func/mean": 0.1361607164144516,
      "rewards/gemini_judge_reward_func/std": 0.25474947690963745,
      "rewards/semantic_correctness_reward_func/mean": 0.46782782673835754,
      "rewards/semantic_correctness_reward_func/std": 0.21055324375629425,
      "rewards/xmlcount_reward_func/mean": 0.7259598970413208,
      "rewards/xmlcount_reward_func/std": 0.4518895447254181,
      "step": 407
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 848.0,
      "completions/mean_length": 179.15626525878906,
      "completions/mean_terminated_length": 155.90365600585938,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.1393026332636251,
      "grad_norm": 0.021146338433027267,
      "kl": 0.0170745849609375,
      "learning_rate": 7.128896457825364e-06,
      "loss": -0.0303,
      "num_tokens": 145496605.0,
      "reward": 0.4385666251182556,
      "reward_std": 0.054036956280469894,
      "rewards/gemini_judge_reward_func/mean": 0.1116071417927742,
      "rewards/gemini_judge_reward_func/std": 0.22669215500354767,
      "rewards/semantic_correctness_reward_func/mean": 0.43236854672431946,
      "rewards/semantic_correctness_reward_func/std": 0.21931228041648865,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 408
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 871.0,
      "completions/mean_length": 171.75894165039062,
      "completions/mean_terminated_length": 152.30136108398438,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.13964406128633008,
      "grad_norm": 0.020863203331828117,
      "kl": 0.017315387725830078,
      "learning_rate": 7.113923802243957e-06,
      "loss": -0.0304,
      "num_tokens": 145874323.0,
      "reward": 0.4249069094657898,
      "reward_std": 0.06283921003341675,
      "rewards/gemini_judge_reward_func/mean": 0.1529017835855484,
      "rewards/gemini_judge_reward_func/std": 0.276716947555542,
      "rewards/semantic_correctness_reward_func/mean": 0.46023085713386536,
      "rewards/semantic_correctness_reward_func/std": 0.20974647998809814,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 409
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 163.0357208251953,
      "completions/mean_terminated_length": 143.3789825439453,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.13998548930903504,
      "grad_norm": 0.01941607892513275,
      "kl": 0.01681995391845703,
      "learning_rate": 7.098928029130529e-06,
      "loss": 0.0088,
      "num_tokens": 146230799.0,
      "reward": 0.3921462297439575,
      "reward_std": 0.06177087500691414,
      "rewards/gemini_judge_reward_func/mean": 0.1127232164144516,
      "rewards/gemini_judge_reward_func/std": 0.23643691837787628,
      "rewards/semantic_correctness_reward_func/mean": 0.44157034158706665,
      "rewards/semantic_correctness_reward_func/std": 0.1995597928762436,
      "rewards/xmlcount_reward_func/mean": 0.6468572020530701,
      "rewards/xmlcount_reward_func/std": 0.47763964533805847,
      "step": 410
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 517.0,
      "completions/mean_length": 152.8616180419922,
      "completions/mean_terminated_length": 137.02272033691406,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.14032691733174,
      "grad_norm": 0.02037540264427662,
      "kl": 0.020148277282714844,
      "learning_rate": 7.083909302476453e-06,
      "loss": -0.0013,
      "num_tokens": 146610804.0,
      "reward": 0.3764539062976837,
      "reward_std": 0.07178690284490585,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.25606971979141235,
      "rewards/semantic_correctness_reward_func/mean": 0.41007286310195923,
      "rewards/semantic_correctness_reward_func/std": 0.2045731246471405,
      "rewards/xmlcount_reward_func/mean": 0.6077500581741333,
      "rewards/xmlcount_reward_func/std": 0.48996880650520325,
      "step": 411
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 808.0,
      "completions/mean_length": 149.24554443359375,
      "completions/mean_terminated_length": 141.3648681640625,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.14066834535444497,
      "grad_norm": 0.02088170126080513,
      "kl": 0.015942096710205078,
      "learning_rate": 7.068867786524116e-06,
      "loss": -0.0028,
      "num_tokens": 146958603.0,
      "reward": 0.4193098247051239,
      "reward_std": 0.04512748494744301,
      "rewards/gemini_judge_reward_func/mean": 0.1037946417927742,
      "rewards/gemini_judge_reward_func/std": 0.22855830192565918,
      "rewards/semantic_correctness_reward_func/mean": 0.4232097566127777,
      "rewards/semantic_correctness_reward_func/std": 0.22072601318359375,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 412
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 945.0,
      "completions/mean_length": 153.6919708251953,
      "completions/mean_terminated_length": 149.78924560546875,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.14100977337714993,
      "grad_norm": 0.020876459777355194,
      "kl": 0.020923137664794922,
      "learning_rate": 7.053803645765128e-06,
      "loss": 0.0084,
      "num_tokens": 147314350.0,
      "reward": 0.43098554015159607,
      "reward_std": 0.053628940135240555,
      "rewards/gemini_judge_reward_func/mean": 0.1618303507566452,
      "rewards/gemini_judge_reward_func/std": 0.2943499982357025,
      "rewards/semantic_correctness_reward_func/mean": 0.4861953556537628,
      "rewards/semantic_correctness_reward_func/std": 0.2150430679321289,
      "rewards/xmlcount_reward_func/mean": 0.67253577709198,
      "rewards/xmlcount_reward_func/std": 0.46746620535850525,
      "step": 413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 327.0,
      "completions/mean_length": 156.25,
      "completions/mean_terminated_length": 148.43243408203125,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.14135120139985488,
      "grad_norm": 0.022351212799549103,
      "kl": 0.01930093765258789,
      "learning_rate": 7.038717044938519e-06,
      "loss": -0.0056,
      "num_tokens": 147669662.0,
      "reward": 0.48152732849121094,
      "reward_std": 0.07369009405374527,
      "rewards/gemini_judge_reward_func/mean": 0.1830357164144516,
      "rewards/gemini_judge_reward_func/std": 0.3013090491294861,
      "rewards/semantic_correctness_reward_func/mean": 0.4685649275779724,
      "rewards/semantic_correctness_reward_func/std": 0.23053161799907684,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 837.0,
      "completions/mean_length": 161.6607208251953,
      "completions/mean_terminated_length": 141.97259521484375,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.14169262942255986,
      "grad_norm": 0.023213328793644905,
      "kl": 0.02099323272705078,
      "learning_rate": 7.023608149028936e-06,
      "loss": -0.0126,
      "num_tokens": 148012226.0,
      "reward": 0.44395384192466736,
      "reward_std": 0.05156712979078293,
      "rewards/gemini_judge_reward_func/mean": 0.1607142835855484,
      "rewards/gemini_judge_reward_func/std": 0.28283706307411194,
      "rewards/semantic_correctness_reward_func/mean": 0.4683404564857483,
      "rewards/semantic_correctness_reward_func/std": 0.21465305984020233,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 309.0,
      "completions/mean_length": 149.7857208251953,
      "completions/mean_terminated_length": 141.909912109375,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.14203405744526482,
      "grad_norm": 0.020580802112817764,
      "kl": 0.017251014709472656,
      "learning_rate": 7.008477123264849e-06,
      "loss": -0.0101,
      "num_tokens": 148353486.0,
      "reward": 0.480153888463974,
      "reward_std": 0.08174009621143341,
      "rewards/gemini_judge_reward_func/mean": 0.1796875,
      "rewards/gemini_judge_reward_func/std": 0.29536840319633484,
      "rewards/semantic_correctness_reward_func/mean": 0.5041443705558777,
      "rewards/semantic_correctness_reward_func/std": 0.2075480967760086,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 315.0,
      "completions/mean_length": 161.33929443359375,
      "completions/mean_terminated_length": 141.64382934570312,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.14237548546796977,
      "grad_norm": 0.02087719924747944,
      "kl": 0.020658493041992188,
      "learning_rate": 6.993324133116726e-06,
      "loss": -0.0102,
      "num_tokens": 148723782.0,
      "reward": 0.432353675365448,
      "reward_std": 0.05930045619606972,
      "rewards/gemini_judge_reward_func/mean": 0.1316964328289032,
      "rewards/gemini_judge_reward_func/std": 0.2667270004749298,
      "rewards/semantic_correctness_reward_func/mean": 0.4326254427433014,
      "rewards/semantic_correctness_reward_func/std": 0.2124425172805786,
      "rewards/xmlcount_reward_func/mean": 0.7328750491142273,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0401785714285714,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 869.0,
      "completions/mean_length": 184.37501525878906,
      "completions/mean_terminated_length": 149.2279052734375,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.14271691349067475,
      "grad_norm": 0.02160128392279148,
      "kl": 0.01466989517211914,
      "learning_rate": 6.978149344295242e-06,
      "loss": 0.0137,
      "num_tokens": 149068342.0,
      "reward": 0.45865893363952637,
      "reward_std": 0.07876806706190109,
      "rewards/gemini_judge_reward_func/mean": 0.1473214328289032,
      "rewards/gemini_judge_reward_func/std": 0.2562088966369629,
      "rewards/semantic_correctness_reward_func/mean": 0.425651490688324,
      "rewards/semantic_correctness_reward_func/std": 0.21036946773529053,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 881.0,
      "completions/mean_length": 175.50894165039062,
      "completions/mean_terminated_length": 148.13824462890625,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.1430583415133797,
      "grad_norm": 0.032436758279800415,
      "kl": 0.022993087768554688,
      "learning_rate": 6.9629529227494575e-06,
      "loss": 0.0047,
      "num_tokens": 149440944.0,
      "reward": 0.4210297167301178,
      "reward_std": 0.06524720042943954,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.2646643817424774,
      "rewards/semantic_correctness_reward_func/mean": 0.41618412733078003,
      "rewards/semantic_correctness_reward_func/std": 0.21945932507514954,
      "rewards/xmlcount_reward_func/mean": 0.7239465117454529,
      "rewards/xmlcount_reward_func/std": 0.4438221752643585,
      "step": 419
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 182.75001525878906,
      "completions/mean_terminated_length": 151.59259033203125,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.1433997695360847,
      "grad_norm": 0.019419504329562187,
      "kl": 0.013905525207519531,
      "learning_rate": 6.9477350346650016e-06,
      "loss": -0.0283,
      "num_tokens": 149800156.0,
      "reward": 0.4564591348171234,
      "reward_std": 0.06576818972826004,
      "rewards/gemini_judge_reward_func/mean": 0.1863839328289032,
      "rewards/gemini_judge_reward_func/std": 0.273295521736145,
      "rewards/semantic_correctness_reward_func/mean": 0.4437777101993561,
      "rewards/semantic_correctness_reward_func/std": 0.23594844341278076,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 420
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.044642857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 998.0,
      "completions/mean_length": 190.56251525878906,
      "completions/mean_terminated_length": 151.6168212890625,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.14374119755878964,
      "grad_norm": 0.022768495604395866,
      "kl": 0.02614879608154297,
      "learning_rate": 6.932495846462262e-06,
      "loss": 0.0009,
      "num_tokens": 150198454.0,
      "reward": 0.3712509572505951,
      "reward_std": 0.06766778230667114,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.22829538583755493,
      "rewards/semantic_correctness_reward_func/mean": 0.4421386122703552,
      "rewards/semantic_correctness_reward_func/std": 0.18618284165859222,
      "rewards/xmlcount_reward_func/mean": 0.5787099003791809,
      "rewards/xmlcount_reward_func/std": 0.49204617738723755,
      "step": 421
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 695.0,
      "completions/mean_length": 160.7366180419922,
      "completions/mean_terminated_length": 149.0181121826172,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.1440826255814946,
      "grad_norm": 0.02022860199213028,
      "kl": 0.017331600189208984,
      "learning_rate": 6.9172355247945586e-06,
      "loss": -0.0178,
      "num_tokens": 150555467.0,
      "reward": 0.45713624358177185,
      "reward_std": 0.06998570263385773,
      "rewards/gemini_judge_reward_func/mean": 0.1529017835855484,
      "rewards/gemini_judge_reward_func/std": 0.2817355990409851,
      "rewards/semantic_correctness_reward_func/mean": 0.4426274299621582,
      "rewards/semantic_correctness_reward_func/std": 0.22578133642673492,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 170.2366180419922,
      "completions/mean_terminated_length": 158.64706420898438,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.14442405360419958,
      "grad_norm": 0.019036274403333664,
      "kl": 0.019074440002441406,
      "learning_rate": 6.901954236546324e-06,
      "loss": 0.0253,
      "num_tokens": 150930284.0,
      "reward": 0.4317302107810974,
      "reward_std": 0.0508720763027668,
      "rewards/gemini_judge_reward_func/mean": 0.1305803507566452,
      "rewards/gemini_judge_reward_func/std": 0.22453762590885162,
      "rewards/semantic_correctness_reward_func/mean": 0.4317401349544525,
      "rewards/semantic_correctness_reward_func/std": 0.19227589666843414,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 736.0,
      "completions/mean_length": 182.78126525878906,
      "completions/mean_terminated_length": 151.625,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.14476548162690453,
      "grad_norm": 0.019461285322904587,
      "kl": 0.018743515014648438,
      "learning_rate": 6.88665214883128e-06,
      "loss": 0.0233,
      "num_tokens": 151300231.0,
      "reward": 0.41627252101898193,
      "reward_std": 0.07595758885145187,
      "rewards/gemini_judge_reward_func/mean": 0.1383928507566452,
      "rewards/gemini_judge_reward_func/std": 0.2579222023487091,
      "rewards/semantic_correctness_reward_func/mean": 0.4460768401622772,
      "rewards/semantic_correctness_reward_func/std": 0.19947165250778198,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 164.13839721679688,
      "completions/mean_terminated_length": 156.3918914794922,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.14510690964960948,
      "grad_norm": 0.01971801184117794,
      "kl": 0.01684427261352539,
      "learning_rate": 6.871329428990602e-06,
      "loss": -0.003,
      "num_tokens": 151681530.0,
      "reward": 0.4248278737068176,
      "reward_std": 0.06656418740749359,
      "rewards/gemini_judge_reward_func/mean": 0.1294642835855484,
      "rewards/gemini_judge_reward_func/std": 0.27094367146492004,
      "rewards/semantic_correctness_reward_func/mean": 0.41733548045158386,
      "rewards/semantic_correctness_reward_func/std": 0.21749421954154968,
      "rewards/xmlcount_reward_func/mean": 0.7239375710487366,
      "rewards/xmlcount_reward_func/std": 0.4488601088523865,
      "step": 425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 652.0,
      "completions/mean_length": 167.9419708251953,
      "completions/mean_terminated_length": 156.32127380371094,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.14544833767231446,
      "grad_norm": 0.019281448796391487,
      "kl": 0.017251014709472656,
      "learning_rate": 6.855986244591104e-06,
      "loss": 0.0054,
      "num_tokens": 152040889.0,
      "reward": 0.45101961493492126,
      "reward_std": 0.07227209955453873,
      "rewards/gemini_judge_reward_func/mean": 0.1551339328289032,
      "rewards/gemini_judge_reward_func/std": 0.2805072069168091,
      "rewards/semantic_correctness_reward_func/mean": 0.46121397614479065,
      "rewards/semantic_correctness_reward_func/std": 0.2255532592535019,
      "rewards/xmlcount_reward_func/mean": 0.7418080568313599,
      "rewards/xmlcount_reward_func/std": 0.4394637644290924,
      "step": 426
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.0,
      "completions/mean_length": 164.71429443359375,
      "completions/mean_terminated_length": 153.04977416992188,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.14578976569501942,
      "grad_norm": 0.01867520622909069,
      "kl": 0.018479347229003906,
      "learning_rate": 6.840622763423391e-06,
      "loss": 0.0091,
      "num_tokens": 152378793.0,
      "reward": 0.4158693850040436,
      "reward_std": 0.05586162954568863,
      "rewards/gemini_judge_reward_func/mean": 0.1104910746216774,
      "rewards/gemini_judge_reward_func/std": 0.20993109047412872,
      "rewards/semantic_correctness_reward_func/mean": 0.4283645451068878,
      "rewards/semantic_correctness_reward_func/std": 0.20962867140769958,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 427
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 601.0,
      "completions/mean_length": 169.82589721679688,
      "completions/mean_terminated_length": 158.23077392578125,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.14613119371772437,
      "grad_norm": 0.02032027207314968,
      "kl": 0.017673969268798828,
      "learning_rate": 6.825239153500029e-06,
      "loss": -0.0278,
      "num_tokens": 152725418.0,
      "reward": 0.4743519127368927,
      "reward_std": 0.0713319256901741,
      "rewards/gemini_judge_reward_func/mean": 0.1417410671710968,
      "rewards/gemini_judge_reward_func/std": 0.26838013529777527,
      "rewards/semantic_correctness_reward_func/mean": 0.4437771439552307,
      "rewards/semantic_correctness_reward_func/std": 0.21585533022880554,
      "rewards/xmlcount_reward_func/mean": 0.8222500085830688,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 428
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 164.98214721679688,
      "completions/mean_terminated_length": 157.2432403564453,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.14647262174042935,
      "grad_norm": 0.02102663740515709,
      "kl": 0.017747879028320312,
      "learning_rate": 6.809835583053716e-06,
      "loss": -0.0108,
      "num_tokens": 153084938.0,
      "reward": 0.46583712100982666,
      "reward_std": 0.0666920468211174,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.2437330037355423,
      "rewards/semantic_correctness_reward_func/mean": 0.4279892146587372,
      "rewards/semantic_correctness_reward_func/std": 0.207007497549057,
      "rewards/xmlcount_reward_func/mean": 0.8222500681877136,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 183.44644165039062,
      "completions/mean_terminated_length": 160.31192016601562,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.1468140497631343,
      "grad_norm": 0.019378382712602615,
      "kl": 0.017717361450195312,
      "learning_rate": 6.794412220535426e-06,
      "loss": -0.0137,
      "num_tokens": 153469126.0,
      "reward": 0.4423011839389801,
      "reward_std": 0.067063108086586,
      "rewards/gemini_judge_reward_func/mean": 0.1674107164144516,
      "rewards/gemini_judge_reward_func/std": 0.2878098487854004,
      "rewards/semantic_correctness_reward_func/mean": 0.46568432450294495,
      "rewards/semantic_correctness_reward_func/std": 0.2046249806880951,
      "rewards/xmlcount_reward_func/mean": 0.705500066280365,
      "rewards/xmlcount_reward_func/std": 0.45444735884666443,
      "step": 430
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0401785714285714,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 746.0,
      "completions/mean_length": 191.10714721679688,
      "completions/mean_terminated_length": 156.2418670654297,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.14715547778583926,
      "grad_norm": 0.01916058361530304,
      "kl": 0.017314910888671875,
      "learning_rate": 6.778969234612583e-06,
      "loss": -0.0471,
      "num_tokens": 153827938.0,
      "reward": 0.4171583354473114,
      "reward_std": 0.05001484602689743,
      "rewards/gemini_judge_reward_func/mean": 0.1127232164144516,
      "rewards/gemini_judge_reward_func/std": 0.2601417303085327,
      "rewards/semantic_correctness_reward_func/mean": 0.4303452670574188,
      "rewards/semantic_correctness_reward_func/std": 0.21427829563617706,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 431
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 703.0,
      "completions/mean_length": 169.38394165039062,
      "completions/mean_terminated_length": 153.84544372558594,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.14749690580854424,
      "grad_norm": 0.02057456597685814,
      "kl": 0.016883373260498047,
      "learning_rate": 6.763506794167207e-06,
      "loss": -0.0115,
      "num_tokens": 154183800.0,
      "reward": 0.4381869435310364,
      "reward_std": 0.0673794075846672,
      "rewards/gemini_judge_reward_func/mean": 0.1138392835855484,
      "rewards/gemini_judge_reward_func/std": 0.22432857751846313,
      "rewards/semantic_correctness_reward_func/mean": 0.4260060787200928,
      "rewards/semantic_correctness_reward_func/std": 0.19526949524879456,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 432
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.044642857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 201.54019165039062,
      "completions/mean_terminated_length": 163.1074676513672,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.1478383338312492,
      "grad_norm": 0.019249025732278824,
      "kl": 0.019617557525634766,
      "learning_rate": 6.748025068294067e-06,
      "loss": -0.0242,
      "num_tokens": 154560981.0,
      "reward": 0.3731415867805481,
      "reward_std": 0.05926031246781349,
      "rewards/gemini_judge_reward_func/mean": 0.1082589253783226,
      "rewards/gemini_judge_reward_func/std": 0.22397971153259277,
      "rewards/semantic_correctness_reward_func/mean": 0.4336898624897003,
      "rewards/semantic_correctness_reward_func/std": 0.2114102691411972,
      "rewards/xmlcount_reward_func/mean": 0.6077500581741333,
      "rewards/xmlcount_reward_func/std": 0.48996880650520325,
      "step": 433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 700.0,
      "completions/mean_length": 165.08929443359375,
      "completions/mean_terminated_length": 153.42987060546875,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.14817976185395415,
      "grad_norm": 0.021220970898866653,
      "kl": 0.019231796264648438,
      "learning_rate": 6.732524226298841e-06,
      "loss": -0.0126,
      "num_tokens": 154907193.0,
      "reward": 0.45769333839416504,
      "reward_std": 0.060595184564590454,
      "rewards/gemini_judge_reward_func/mean": 0.1350446492433548,
      "rewards/gemini_judge_reward_func/std": 0.23772463202476501,
      "rewards/semantic_correctness_reward_func/mean": 0.44537705183029175,
      "rewards/semantic_correctness_reward_func/std": 0.20738016068935394,
      "rewards/xmlcount_reward_func/mean": 0.786500096321106,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 434
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 759.0,
      "completions/mean_length": 170.7678680419922,
      "completions/mean_terminated_length": 147.28439331054688,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.14852118987665913,
      "grad_norm": 0.020631877705454826,
      "kl": 0.018312454223632812,
      "learning_rate": 6.717004437696249e-06,
      "loss": -0.0163,
      "num_tokens": 155285817.0,
      "reward": 0.4152604341506958,
      "reward_std": 0.06980551779270172,
      "rewards/gemini_judge_reward_func/mean": 0.1662946492433548,
      "rewards/gemini_judge_reward_func/std": 0.30590617656707764,
      "rewards/semantic_correctness_reward_func/mean": 0.4470253586769104,
      "rewards/semantic_correctness_reward_func/std": 0.22271786630153656,
      "rewards/xmlcount_reward_func/mean": 0.6483437418937683,
      "rewards/xmlcount_reward_func/std": 0.4749422073364258,
      "step": 435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 380.0,
      "completions/mean_length": 169.83482360839844,
      "completions/mean_terminated_length": 146.32568359375,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.14886261789936409,
      "grad_norm": 0.019136464223265648,
      "kl": 0.020123004913330078,
      "learning_rate": 6.701465872208216e-06,
      "loss": 0.0038,
      "num_tokens": 155641420.0,
      "reward": 0.4250961244106293,
      "reward_std": 0.06240704655647278,
      "rewards/gemini_judge_reward_func/mean": 0.1138392835855484,
      "rewards/gemini_judge_reward_func/std": 0.2141004502773285,
      "rewards/semantic_correctness_reward_func/mean": 0.4320519268512726,
      "rewards/semantic_correctness_reward_func/std": 0.1874925047159195,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0535714285714286,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 195.6741180419922,
      "completions/mean_terminated_length": 148.78773498535156,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.14920404592206907,
      "grad_norm": 0.020079955458641052,
      "kl": 0.019093990325927734,
      "learning_rate": 6.685908699762003e-06,
      "loss": 0.0153,
      "num_tokens": 156023519.0,
      "reward": 0.407644659280777,
      "reward_std": 0.07643434405326843,
      "rewards/gemini_judge_reward_func/mean": 0.1517857164144516,
      "rewards/gemini_judge_reward_func/std": 0.2706849277019501,
      "rewards/semantic_correctness_reward_func/mean": 0.4476517140865326,
      "rewards/semantic_correctness_reward_func/std": 0.20519518852233887,
      "rewards/xmlcount_reward_func/mean": 0.643500030040741,
      "rewards/xmlcount_reward_func/std": 0.48071080446243286,
      "step": 437
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 645.0,
      "completions/mean_length": 186.77232360839844,
      "completions/mean_terminated_length": 155.76388549804688,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.14954547394477402,
      "grad_norm": 0.019271496683359146,
      "kl": 0.01635265350341797,
      "learning_rate": 6.670333090488357e-06,
      "loss": -0.0304,
      "num_tokens": 156369764.0,
      "reward": 0.423153817653656,
      "reward_std": 0.05922694131731987,
      "rewards/gemini_judge_reward_func/mean": 0.09375,
      "rewards/gemini_judge_reward_func/std": 0.18679800629615784,
      "rewards/semantic_correctness_reward_func/mean": 0.4267689883708954,
      "rewards/semantic_correctness_reward_func/std": 0.19564877450466156,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 438
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 177.63394165039062,
      "completions/mean_terminated_length": 170.00901794433594,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.14988690196747897,
      "grad_norm": 0.019324006512761116,
      "kl": 0.016142845153808594,
      "learning_rate": 6.654739214719642e-06,
      "loss": -0.0193,
      "num_tokens": 156725898.0,
      "reward": 0.4374414384365082,
      "reward_std": 0.06267654150724411,
      "rewards/gemini_judge_reward_func/mean": 0.0948660746216774,
      "rewards/gemini_judge_reward_func/std": 0.19719554483890533,
      "rewards/semantic_correctness_reward_func/mean": 0.4211089313030243,
      "rewards/semantic_correctness_reward_func/std": 0.20056340098381042,
      "rewards/xmlcount_reward_func/mean": 0.7881830930709839,
      "rewards/xmlcount_reward_func/std": 0.40579503774642944,
      "step": 439
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 605.0,
      "completions/mean_length": 168.65625,
      "completions/mean_terminated_length": 149.1278533935547,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.15022832999018396,
      "grad_norm": 0.02156071364879608,
      "kl": 0.0205535888671875,
      "learning_rate": 6.6391272429879886e-06,
      "loss": -0.005,
      "num_tokens": 157090797.0,
      "reward": 0.41712579131126404,
      "reward_std": 0.07244788855314255,
      "rewards/gemini_judge_reward_func/mean": 0.1171875,
      "rewards/gemini_judge_reward_func/std": 0.2515541613101959,
      "rewards/semantic_correctness_reward_func/mean": 0.40336090326309204,
      "rewards/semantic_correctness_reward_func/std": 0.21470442414283752,
      "rewards/xmlcount_reward_func/mean": 0.7239464521408081,
      "rewards/xmlcount_reward_func/std": 0.44382214546203613,
      "step": 440
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 441.0,
      "completions/mean_length": 162.4553680419922,
      "completions/mean_terminated_length": 154.69369506835938,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.1505697580128889,
      "grad_norm": 0.020700950175523758,
      "kl": 0.01773834228515625,
      "learning_rate": 6.6234973460234184e-06,
      "loss": -0.0183,
      "num_tokens": 157448539.0,
      "reward": 0.4645825922489166,
      "reward_std": 0.08551337569952011,
      "rewards/gemini_judge_reward_func/mean": 0.1584821492433548,
      "rewards/gemini_judge_reward_func/std": 0.28606563806533813,
      "rewards/semantic_correctness_reward_func/mean": 0.4790647327899933,
      "rewards/semantic_correctness_reward_func/std": 0.23252920806407928,
      "rewards/xmlcount_reward_func/mean": 0.7634419798851013,
      "rewards/xmlcount_reward_func/std": 0.4243088662624359,
      "step": 441
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 834.0,
      "completions/mean_length": 169.44644165039062,
      "completions/mean_terminated_length": 149.93606567382812,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.15091118603559386,
      "grad_norm": 0.02045726776123047,
      "kl": 0.016900062561035156,
      "learning_rate": 6.607849694751978e-06,
      "loss": -0.0017,
      "num_tokens": 157800091.0,
      "reward": 0.4241785407066345,
      "reward_std": 0.06672972440719604,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.25163623690605164,
      "rewards/semantic_correctness_reward_func/mean": 0.45698192715644836,
      "rewards/semantic_correctness_reward_func/std": 0.2180858850479126,
      "rewards/xmlcount_reward_func/mean": 0.7114196419715881,
      "rewards/xmlcount_reward_func/std": 0.45252570509910583,
      "step": 442
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 184.04019165039062,
      "completions/mean_terminated_length": 160.92201232910156,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.15125261405829885,
      "grad_norm": 0.020864713937044144,
      "kl": 0.020277023315429688,
      "learning_rate": 6.592184460293878e-06,
      "loss": -0.0066,
      "num_tokens": 158173060.0,
      "reward": 0.3883172571659088,
      "reward_std": 0.051999613642692566,
      "rewards/gemini_judge_reward_func/mean": 0.0825892835855484,
      "rewards/gemini_judge_reward_func/std": 0.20335854589939117,
      "rewards/semantic_correctness_reward_func/mean": 0.3843896985054016,
      "rewards/semantic_correctness_reward_func/std": 0.20913942158222198,
      "rewards/xmlcount_reward_func/mean": 0.6960089802742004,
      "rewards/xmlcount_reward_func/std": 0.46085411310195923,
      "step": 443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 755.0,
      "completions/mean_length": 176.4553680419922,
      "completions/mean_terminated_length": 157.10501098632812,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.1515940420810038,
      "grad_norm": 0.021549487486481667,
      "kl": 0.017778396606445312,
      "learning_rate": 6.576501813961609e-06,
      "loss": 0.0159,
      "num_tokens": 158517066.0,
      "reward": 0.44117918610572815,
      "reward_std": 0.07043396681547165,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.22706440091133118,
      "rewards/semantic_correctness_reward_func/mean": 0.4476993978023529,
      "rewards/semantic_correctness_reward_func/std": 0.2108486443758011,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 445.0,
      "completions/mean_length": 172.6607208251953,
      "completions/mean_terminated_length": 153.22373962402344,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.15193547010370875,
      "grad_norm": 0.019931474700570107,
      "kl": 0.01573467254638672,
      "learning_rate": 6.560801927258081e-06,
      "loss": -0.0191,
      "num_tokens": 158853982.0,
      "reward": 0.4490154981613159,
      "reward_std": 0.06086720898747444,
      "rewards/gemini_judge_reward_func/mean": 0.0825892835855484,
      "rewards/gemini_judge_reward_func/std": 0.1977689266204834,
      "rewards/semantic_correctness_reward_func/mean": 0.41752371191978455,
      "rewards/semantic_correctness_reward_func/std": 0.18293553590774536,
      "rewards/xmlcount_reward_func/mean": 0.8311875462532043,
      "rewards/xmlcount_reward_func/std": 0.3765355050563812,
      "step": 445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 729.0,
      "completions/mean_length": 159.55804443359375,
      "completions/mean_terminated_length": 155.68162536621094,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.15227689812641373,
      "grad_norm": 0.021299151703715324,
      "kl": 0.017287254333496094,
      "learning_rate": 6.545084971874738e-06,
      "loss": 0.0017,
      "num_tokens": 159219791.0,
      "reward": 0.431675523519516,
      "reward_std": 0.06107047200202942,
      "rewards/gemini_judge_reward_func/mean": 0.1082589253783226,
      "rewards/gemini_judge_reward_func/std": 0.2124195545911789,
      "rewards/semantic_correctness_reward_func/mean": 0.44035953283309937,
      "rewards/semantic_correctness_reward_func/std": 0.21529170870780945,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 734.0,
      "completions/mean_length": 185.9241180419922,
      "completions/mean_terminated_length": 154.88426208496094,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.1526183261491187,
      "grad_norm": 0.019747041165828705,
      "kl": 0.015558242797851562,
      "learning_rate": 6.529351119689687e-06,
      "loss": -0.0098,
      "num_tokens": 159573474.0,
      "reward": 0.4196682274341583,
      "reward_std": 0.059854909777641296,
      "rewards/gemini_judge_reward_func/mean": 0.1116071417927742,
      "rewards/gemini_judge_reward_func/std": 0.21397185325622559,
      "rewards/semantic_correctness_reward_func/mean": 0.4272516667842865,
      "rewards/semantic_correctness_reward_func/std": 0.20560157299041748,
      "rewards/xmlcount_reward_func/mean": 0.7239375710487366,
      "rewards/xmlcount_reward_func/std": 0.4488601088523865,
      "step": 447
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 615.0,
      "completions/mean_length": 165.14732360839844,
      "completions/mean_terminated_length": 149.5318145751953,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.15295975417182364,
      "grad_norm": 0.021277599036693573,
      "kl": 0.02167987823486328,
      "learning_rate": 6.513600542765816e-06,
      "loss": -0.0232,
      "num_tokens": 159928331.0,
      "reward": 0.4534654915332794,
      "reward_std": 0.07899215072393417,
      "rewards/gemini_judge_reward_func/mean": 0.1584821492433548,
      "rewards/gemini_judge_reward_func/std": 0.2688947916030884,
      "rewards/semantic_correctness_reward_func/mean": 0.45780062675476074,
      "rewards/semantic_correctness_reward_func/std": 0.2195635586977005,
      "rewards/xmlcount_reward_func/mean": 0.7462812662124634,
      "rewards/xmlcount_reward_func/std": 0.4369716942310333,
      "step": 448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 147.91519165039062,
      "completions/mean_terminated_length": 140.02252197265625,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.15330118219452862,
      "grad_norm": 0.02262871339917183,
      "kl": 0.02004528045654297,
      "learning_rate": 6.49783341334891e-06,
      "loss": -0.0262,
      "num_tokens": 160286236.0,
      "reward": 0.41993653774261475,
      "reward_std": 0.07568960636854172,
      "rewards/gemini_judge_reward_func/mean": 0.1361607164144516,
      "rewards/gemini_judge_reward_func/std": 0.22804586589336395,
      "rewards/semantic_correctness_reward_func/mean": 0.4688611328601837,
      "rewards/semantic_correctness_reward_func/std": 0.19843356311321259,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 449
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 179.52679443359375,
      "completions/mean_terminated_length": 148.25,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.15364261021723358,
      "grad_norm": 0.019460123032331467,
      "kl": 0.021003246307373047,
      "learning_rate": 6.4820499038657695e-06,
      "loss": 0.0156,
      "num_tokens": 160678002.0,
      "reward": 0.3975132405757904,
      "reward_std": 0.060600072145462036,
      "rewards/gemini_judge_reward_func/mean": 0.1439732164144516,
      "rewards/gemini_judge_reward_func/std": 0.27340537309646606,
      "rewards/semantic_correctness_reward_func/mean": 0.4305214285850525,
      "rewards/semantic_correctness_reward_func/std": 0.2286413013935089,
      "rewards/xmlcount_reward_func/mean": 0.6345491409301758,
      "rewards/xmlcount_reward_func/std": 0.48328086733818054,
      "step": 450
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 786.0,
      "completions/mean_length": 165.75894165039062,
      "completions/mean_terminated_length": 158.0270233154297,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.15398403823993853,
      "grad_norm": 0.020578309893608093,
      "kl": 0.017749786376953125,
      "learning_rate": 6.466250186922325e-06,
      "loss": -0.0127,
      "num_tokens": 161026600.0,
      "reward": 0.4278218448162079,
      "reward_std": 0.055558666586875916,
      "rewards/gemini_judge_reward_func/mean": 0.0915178582072258,
      "rewards/gemini_judge_reward_func/std": 0.1952219009399414,
      "rewards/semantic_correctness_reward_func/mean": 0.3830733299255371,
      "rewards/semantic_correctness_reward_func/std": 0.19349335134029388,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 451
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 736.0,
      "completions/mean_length": 170.3125,
      "completions/mean_terminated_length": 154.79090881347656,
      "completions/min_length": 41.0,
      "completions/min_terminated_length": 41.0,
      "epoch": 0.1543254662626435,
      "grad_norm": 0.01939302682876587,
      "kl": 0.016997814178466797,
      "learning_rate": 6.450434435301751e-06,
      "loss": -0.0112,
      "num_tokens": 161412190.0,
      "reward": 0.4270239770412445,
      "reward_std": 0.0703679621219635,
      "rewards/gemini_judge_reward_func/mean": 0.1104910746216774,
      "rewards/gemini_judge_reward_func/std": 0.21520504355430603,
      "rewards/semantic_correctness_reward_func/mean": 0.45732492208480835,
      "rewards/semantic_correctness_reward_func/std": 0.20194603502750397,
      "rewards/xmlcount_reward_func/mean": 0.7284063100814819,
      "rewards/xmlcount_reward_func/std": 0.4465976059436798,
      "step": 452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 164.73214721679688,
      "completions/mean_terminated_length": 153.06788635253906,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.15466689428534847,
      "grad_norm": 0.02077455259859562,
      "kl": 0.015947818756103516,
      "learning_rate": 6.434602821962571e-06,
      "loss": -0.0275,
      "num_tokens": 161741334.0,
      "reward": 0.47266238927841187,
      "reward_std": 0.07710576057434082,
      "rewards/gemini_judge_reward_func/mean": 0.1462053507566452,
      "rewards/gemini_judge_reward_func/std": 0.2793271541595459,
      "rewards/semantic_correctness_reward_func/mean": 0.4264009892940521,
      "rewards/semantic_correctness_reward_func/std": 0.2282664179801941,
      "rewards/xmlcount_reward_func/mean": 0.8222500681877136,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 557.0,
      "completions/mean_length": 171.4732208251953,
      "completions/mean_terminated_length": 152.00912475585938,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.15500832230805342,
      "grad_norm": 0.019553987309336662,
      "kl": 0.018682479858398438,
      "learning_rate": 6.418755520036775e-06,
      "loss": -0.0309,
      "num_tokens": 162107304.0,
      "reward": 0.4157797396183014,
      "reward_std": 0.054216425865888596,
      "rewards/gemini_judge_reward_func/mean": 0.0814732164144516,
      "rewards/gemini_judge_reward_func/std": 0.17334242165088654,
      "rewards/semantic_correctness_reward_func/mean": 0.45020225644111633,
      "rewards/semantic_correctness_reward_func/std": 0.2048385888338089,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 454
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 357.0,
      "completions/mean_length": 167.2991180419922,
      "completions/mean_terminated_length": 147.73971557617188,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.1553497503307584,
      "grad_norm": 0.021014470607042313,
      "kl": 0.019474029541015625,
      "learning_rate": 6.402892702827916e-06,
      "loss": 0.015,
      "num_tokens": 162469799.0,
      "reward": 0.4399973154067993,
      "reward_std": 0.08108548820018768,
      "rewards/gemini_judge_reward_func/mean": 0.1584821492433548,
      "rewards/gemini_judge_reward_func/std": 0.2688947916030884,
      "rewards/semantic_correctness_reward_func/mean": 0.4530222713947296,
      "rewards/semantic_correctness_reward_func/std": 0.19590409100055695,
      "rewards/xmlcount_reward_func/mean": 0.7150000929832458,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 533.0,
      "completions/mean_length": 163.92857360839844,
      "completions/mean_terminated_length": 152.25340270996094,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.15569117835346336,
      "grad_norm": 0.021115519106388092,
      "kl": 0.02216339111328125,
      "learning_rate": 6.387014543809224e-06,
      "loss": 0.0144,
      "num_tokens": 162832799.0,
      "reward": 0.43464815616607666,
      "reward_std": 0.07387977093458176,
      "rewards/gemini_judge_reward_func/mean": 0.1339285671710968,
      "rewards/gemini_judge_reward_func/std": 0.24815140664577484,
      "rewards/semantic_correctness_reward_func/mean": 0.41505321860313416,
      "rewards/semantic_correctness_reward_func/std": 0.195115327835083,
      "rewards/xmlcount_reward_func/mean": 0.745165228843689,
      "rewards/xmlcount_reward_func/std": 0.43663734197616577,
      "step": 456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 745.0,
      "completions/mean_length": 158.75894165039062,
      "completions/mean_terminated_length": 154.87893676757812,
      "completions/min_length": 48.0,
      "completions/min_terminated_length": 48.0,
      "epoch": 0.15603260637616834,
      "grad_norm": 0.01938485912978649,
      "kl": 0.01820230484008789,
      "learning_rate": 6.371121216621698e-06,
      "loss": 0.0089,
      "num_tokens": 163185189.0,
      "reward": 0.4345279633998871,
      "reward_std": 0.056550104171037674,
      "rewards/gemini_judge_reward_func/mean": 0.1428571492433548,
      "rewards/gemini_judge_reward_func/std": 0.2288455367088318,
      "rewards/semantic_correctness_reward_func/mean": 0.4211753010749817,
      "rewards/semantic_correctness_reward_func/std": 0.16808317601680756,
      "rewards/xmlcount_reward_func/mean": 0.7328750491142273,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 457
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 392.0,
      "completions/mean_length": 161.32589721679688,
      "completions/mean_terminated_length": 149.61538696289062,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.1563740343988733,
      "grad_norm": 0.02125578746199608,
      "kl": 0.01828622817993164,
      "learning_rate": 6.355212895072223e-06,
      "loss": 0.003,
      "num_tokens": 163542542.0,
      "reward": 0.44018012285232544,
      "reward_std": 0.05906569957733154,
      "rewards/gemini_judge_reward_func/mean": 0.1305803507566452,
      "rewards/gemini_judge_reward_func/std": 0.24711813032627106,
      "rewards/semantic_correctness_reward_func/mean": 0.42258793115615845,
      "rewards/semantic_correctness_reward_func/std": 0.20304962992668152,
      "rewards/xmlcount_reward_func/mean": 0.7585759162902832,
      "rewards/xmlcount_reward_func/std": 0.4274976849555969,
      "step": 458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 810.0,
      "completions/mean_length": 163.1607208251953,
      "completions/mean_terminated_length": 155.40541076660156,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.15671546242157824,
      "grad_norm": 0.021676773205399513,
      "kl": 0.0220947265625,
      "learning_rate": 6.339289753131649e-06,
      "loss": -0.0109,
      "num_tokens": 163883274.0,
      "reward": 0.4686073660850525,
      "reward_std": 0.06618095934391022,
      "rewards/gemini_judge_reward_func/mean": 0.1361607164144516,
      "rewards/gemini_judge_reward_func/std": 0.2457905411720276,
      "rewards/semantic_correctness_reward_func/mean": 0.4619653820991516,
      "rewards/semantic_correctness_reward_func/std": 0.18577620387077332,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 459
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0580357142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 825.0,
      "completions/mean_length": 212.57589721679688,
      "completions/mean_terminated_length": 162.58294677734375,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.15705689044428323,
      "grad_norm": 0.017990613356232643,
      "kl": 0.01677417755126953,
      "learning_rate": 6.323351964932909e-06,
      "loss": -0.009,
      "num_tokens": 164261219.0,
      "reward": 0.426357239484787,
      "reward_std": 0.05332663282752037,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.2295006811618805,
      "rewards/semantic_correctness_reward_func/mean": 0.4741254448890686,
      "rewards/semantic_correctness_reward_func/std": 0.20388510823249817,
      "rewards/xmlcount_reward_func/mean": 0.708294689655304,
      "rewards/xmlcount_reward_func/std": 0.4551132023334503,
      "step": 460
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 873.0,
      "completions/mean_length": 176.0491180419922,
      "completions/mean_terminated_length": 152.71099853515625,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.15739831846698818,
      "grad_norm": 0.02143760584294796,
      "kl": 0.021224498748779297,
      "learning_rate": 6.3073997047691e-06,
      "loss": -0.0043,
      "num_tokens": 164629886.0,
      "reward": 0.44384151697158813,
      "reward_std": 0.05540228635072708,
      "rewards/gemini_judge_reward_func/mean": 0.1551339328289032,
      "rewards/gemini_judge_reward_func/std": 0.2661517262458801,
      "rewards/semantic_correctness_reward_func/mean": 0.47893956303596497,
      "rewards/semantic_correctness_reward_func/std": 0.21486115455627441,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 461
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 925.0,
      "completions/mean_length": 161.59375,
      "completions/mean_terminated_length": 157.72647094726562,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.15773974648969313,
      "grad_norm": 0.020959317684173584,
      "kl": 0.020437240600585938,
      "learning_rate": 6.291433147091583e-06,
      "loss": -0.0102,
      "num_tokens": 164976791.0,
      "reward": 0.421293705701828,
      "reward_std": 0.05760593339800835,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.2414351999759674,
      "rewards/semantic_correctness_reward_func/mean": 0.42200401425361633,
      "rewards/semantic_correctness_reward_func/std": 0.17594051361083984,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 847.0,
      "completions/mean_length": 179.82144165039062,
      "completions/mean_terminated_length": 156.58714294433594,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.15808117451239811,
      "grad_norm": 0.018165314570069313,
      "kl": 0.02169036865234375,
      "learning_rate": 6.275452466508076e-06,
      "loss": -0.0342,
      "num_tokens": 165367455.0,
      "reward": 0.36975857615470886,
      "reward_std": 0.05713449418544769,
      "rewards/gemini_judge_reward_func/mean": 0.0870535746216774,
      "rewards/gemini_judge_reward_func/std": 0.20831865072250366,
      "rewards/semantic_correctness_reward_func/mean": 0.42343568801879883,
      "rewards/semantic_correctness_reward_func/std": 0.18575748801231384,
      "rewards/xmlcount_reward_func/mean": 0.6256250739097595,
      "rewards/xmlcount_reward_func/std": 0.48569241166114807,
      "step": 463
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 324.0,
      "completions/mean_length": 189.3482208251953,
      "completions/mean_terminated_length": 158.4351806640625,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.15842260253510307,
      "grad_norm": 0.018870696425437927,
      "kl": 0.016434192657470703,
      "learning_rate": 6.259457837780741e-06,
      "loss": -0.0225,
      "num_tokens": 165704941.0,
      "reward": 0.46567752957344055,
      "reward_std": 0.07933323830366135,
      "rewards/gemini_judge_reward_func/mean": 0.1350446492433548,
      "rewards/gemini_judge_reward_func/std": 0.2558937668800354,
      "rewards/semantic_correctness_reward_func/mean": 0.44109275937080383,
      "rewards/semantic_correctness_reward_func/std": 0.2011324018239975,
      "rewards/xmlcount_reward_func/mean": 0.8086027503013611,
      "rewards/xmlcount_reward_func/std": 0.3939329981803894,
      "step": 464
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 750.0,
      "completions/mean_length": 171.37054443359375,
      "completions/mean_terminated_length": 151.9040985107422,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.15876403055780802,
      "grad_norm": 0.01999264396727085,
      "kl": 0.016565322875976562,
      "learning_rate": 6.243449435824276e-06,
      "loss": -0.026,
      "num_tokens": 166053888.0,
      "reward": 0.42705729603767395,
      "reward_std": 0.059816788882017136,
      "rewards/gemini_judge_reward_func/mean": 0.0881696417927742,
      "rewards/gemini_judge_reward_func/std": 0.17328467965126038,
      "rewards/semantic_correctness_reward_func/mean": 0.42169705033302307,
      "rewards/semantic_correctness_reward_func/std": 0.191665381193161,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 590.0,
      "completions/mean_length": 153.91519165039062,
      "completions/mean_terminated_length": 146.0765838623047,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.159105458580513,
      "grad_norm": 0.019290044903755188,
      "kl": 0.019309520721435547,
      "learning_rate": 6.227427435703997e-06,
      "loss": -0.0179,
      "num_tokens": 166413285.0,
      "reward": 0.40731704235076904,
      "reward_std": 0.05208640545606613,
      "rewards/gemini_judge_reward_func/mean": 0.1071428582072258,
      "rewards/gemini_judge_reward_func/std": 0.2136441320180893,
      "rewards/semantic_correctness_reward_func/mean": 0.4280492961406708,
      "rewards/semantic_correctness_reward_func/std": 0.20282071828842163,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 797.0,
      "completions/mean_length": 179.08929443359375,
      "completions/mean_terminated_length": 155.83485412597656,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.15944688660321796,
      "grad_norm": 0.01988377422094345,
      "kl": 0.016337871551513672,
      "learning_rate": 6.211392012633932e-06,
      "loss": -0.0146,
      "num_tokens": 166782753.0,
      "reward": 0.451289564371109,
      "reward_std": 0.06578972935676575,
      "rewards/gemini_judge_reward_func/mean": 0.1194196417927742,
      "rewards/gemini_judge_reward_func/std": 0.2331113964319229,
      "rewards/semantic_correctness_reward_func/mean": 0.46342092752456665,
      "rewards/semantic_correctness_reward_func/std": 0.2133999615907669,
      "rewards/xmlcount_reward_func/mean": 0.777093768119812,
      "rewards/xmlcount_reward_func/std": 0.4154271185398102,
      "step": 467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 346.0,
      "completions/mean_length": 155.96429443359375,
      "completions/mean_terminated_length": 140.1818084716797,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.1597883146259229,
      "grad_norm": 0.020306117832660675,
      "kl": 0.022162914276123047,
      "learning_rate": 6.1953433419748995e-06,
      "loss": -0.0045,
      "num_tokens": 167132245.0,
      "reward": 0.364609032869339,
      "reward_std": 0.06053777411580086,
      "rewards/gemini_judge_reward_func/mean": 0.0904017835855484,
      "rewards/gemini_judge_reward_func/std": 0.19502632319927216,
      "rewards/semantic_correctness_reward_func/mean": 0.39322349429130554,
      "rewards/semantic_correctness_reward_func/std": 0.21105892956256866,
      "rewards/xmlcount_reward_func/mean": 0.6245089769363403,
      "rewards/xmlcount_reward_func/std": 0.48511284589767456,
      "step": 468
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 720.0,
      "completions/mean_length": 185.08482360839844,
      "completions/mean_terminated_length": 158.02304077148438,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.1601297426486279,
      "grad_norm": 0.046392910182476044,
      "kl": 0.032955169677734375,
      "learning_rate": 6.179281599232592e-06,
      "loss": -0.0111,
      "num_tokens": 167492956.0,
      "reward": 0.40365663170814514,
      "reward_std": 0.05007166042923927,
      "rewards/gemini_judge_reward_func/mean": 0.1216517835855484,
      "rewards/gemini_judge_reward_func/std": 0.23908916115760803,
      "rewards/semantic_correctness_reward_func/mean": 0.4164794981479645,
      "rewards/semantic_correctness_reward_func/std": 0.20123189687728882,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 469
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 453.0,
      "completions/mean_length": 161.73214721679688,
      "completions/mean_terminated_length": 157.865478515625,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.16047117067133285,
      "grad_norm": 0.020119668915867805,
      "kl": 0.015314102172851562,
      "learning_rate": 6.163206960055652e-06,
      "loss": -0.0156,
      "num_tokens": 167859308.0,
      "reward": 0.46480125188827515,
      "reward_std": 0.05998440086841583,
      "rewards/gemini_judge_reward_func/mean": 0.0948660746216774,
      "rewards/gemini_judge_reward_func/std": 0.17457951605319977,
      "rewards/semantic_correctness_reward_func/mean": 0.4182741343975067,
      "rewards/semantic_correctness_reward_func/std": 0.21178297698497772,
      "rewards/xmlcount_reward_func/mean": 0.8580000996589661,
      "rewards/xmlcount_reward_func/std": 0.35106155276298523,
      "step": 470
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 874.0,
      "completions/mean_length": 182.29464721679688,
      "completions/mean_terminated_length": 159.12843322753906,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.1608125986940378,
      "grad_norm": 0.01942136511206627,
      "kl": 0.018321990966796875,
      "learning_rate": 6.147119600233758e-06,
      "loss": -0.0183,
      "num_tokens": 168216734.0,
      "reward": 0.44124796986579895,
      "reward_std": 0.06361334770917892,
      "rewards/gemini_judge_reward_func/mean": 0.1328125,
      "rewards/gemini_judge_reward_func/std": 0.26031482219696045,
      "rewards/semantic_correctness_reward_func/mean": 0.43911466002464294,
      "rewards/semantic_correctness_reward_func/std": 0.2211882770061493,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 471
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 919.0,
      "completions/mean_length": 182.95089721679688,
      "completions/mean_terminated_length": 151.80093383789062,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.16115402671674278,
      "grad_norm": 0.02266276441514492,
      "kl": 0.01579761505126953,
      "learning_rate": 6.131019695695702e-06,
      "loss": -0.0148,
      "num_tokens": 168588127.0,
      "reward": 0.4200635850429535,
      "reward_std": 0.06345196068286896,
      "rewards/gemini_judge_reward_func/mean": 0.1194196417927742,
      "rewards/gemini_judge_reward_func/std": 0.22453764081001282,
      "rewards/semantic_correctness_reward_func/mean": 0.4404517710208893,
      "rewards/semantic_correctness_reward_func/std": 0.19075731933116913,
      "rewards/xmlcount_reward_func/mean": 0.7105134129524231,
      "rewards/xmlcount_reward_func/std": 0.4553045928478241,
      "step": 472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 353.0,
      "completions/mean_length": 163.4241180419922,
      "completions/mean_terminated_length": 147.7772674560547,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.16149545473944774,
      "grad_norm": 0.020487939938902855,
      "kl": 0.0145416259765625,
      "learning_rate": 6.114907422507459e-06,
      "loss": -0.03,
      "num_tokens": 168923986.0,
      "reward": 0.48719358444213867,
      "reward_std": 0.07441079616546631,
      "rewards/gemini_judge_reward_func/mean": 0.171875,
      "rewards/gemini_judge_reward_func/std": 0.27919498085975647,
      "rewards/semantic_correctness_reward_func/mean": 0.44771772623062134,
      "rewards/semantic_correctness_reward_func/std": 0.22752991318702698,
      "rewards/xmlcount_reward_func/mean": 0.8222500681877136,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 984.0,
      "completions/mean_length": 185.5044708251953,
      "completions/mean_terminated_length": 158.4562225341797,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.16183688276215272,
      "grad_norm": 0.019220322370529175,
      "kl": 0.018144607543945312,
      "learning_rate": 6.098782956870266e-06,
      "loss": -0.0505,
      "num_tokens": 169316687.0,
      "reward": 0.383688360452652,
      "reward_std": 0.06443572044372559,
      "rewards/gemini_judge_reward_func/mean": 0.1183035746216774,
      "rewards/gemini_judge_reward_func/std": 0.23187629878520966,
      "rewards/semantic_correctness_reward_func/mean": 0.43058452010154724,
      "rewards/semantic_correctness_reward_func/std": 0.19228224456310272,
      "rewards/xmlcount_reward_func/mean": 0.6256250739097595,
      "rewards/xmlcount_reward_func/std": 0.48569241166114807,
      "step": 474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 335.0,
      "completions/mean_length": 159.50894165039062,
      "completions/mean_terminated_length": 151.72071838378906,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.16217831078485767,
      "grad_norm": 0.020380543544888496,
      "kl": 0.01611471176147461,
      "learning_rate": 6.0826464751187e-06,
      "loss": -0.0099,
      "num_tokens": 169640537.0,
      "reward": 0.47983115911483765,
      "reward_std": 0.0633583590388298,
      "rewards/gemini_judge_reward_func/mean": 0.1350446492433548,
      "rewards/gemini_judge_reward_func/std": 0.2317548543214798,
      "rewards/semantic_correctness_reward_func/mean": 0.4488253891468048,
      "rewards/semantic_correctness_reward_func/std": 0.2299881875514984,
      "rewards/xmlcount_reward_func/mean": 0.8401206135749817,
      "rewards/xmlcount_reward_func/std": 0.3684559166431427,
      "step": 475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1014.0,
      "completions/mean_length": 186.27679443359375,
      "completions/mean_terminated_length": 163.2201690673828,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.16251973880756262,
      "grad_norm": 0.01900290884077549,
      "kl": 0.01510167121887207,
      "learning_rate": 6.066498153718735e-06,
      "loss": 0.0154,
      "num_tokens": 170012887.0,
      "reward": 0.4446248412132263,
      "reward_std": 0.051946092396974564,
      "rewards/gemini_judge_reward_func/mean": 0.1216517835855484,
      "rewards/gemini_judge_reward_func/std": 0.23435330390930176,
      "rewards/semantic_correctness_reward_func/mean": 0.44257062673568726,
      "rewards/semantic_correctness_reward_func/std": 0.22105993330478668,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 476
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 263.0,
      "completions/mean_length": 152.46875,
      "completions/mean_terminated_length": 148.560546875,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.1628611668302676,
      "grad_norm": 0.01916034147143364,
      "kl": 0.017156600952148438,
      "learning_rate": 6.0503381692658305e-06,
      "loss": -0.007,
      "num_tokens": 170356148.0,
      "reward": 0.4668045938014984,
      "reward_std": 0.06646783649921417,
      "rewards/gemini_judge_reward_func/mean": 0.1339285671710968,
      "rewards/gemini_judge_reward_func/std": 0.2537356913089752,
      "rewards/semantic_correctness_reward_func/mean": 0.4395405948162079,
      "rewards/semantic_correctness_reward_func/std": 0.2333323359489441,
      "rewards/xmlcount_reward_func/mean": 0.8133125305175781,
      "rewards/xmlcount_reward_func/std": 0.39157772064208984,
      "step": 477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 167.65625,
      "completions/mean_terminated_length": 152.08636474609375,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.16320259485297256,
      "grad_norm": 0.020782776176929474,
      "kl": 0.01477193832397461,
      "learning_rate": 6.034166698482984e-06,
      "loss": 0.0215,
      "num_tokens": 170715555.0,
      "reward": 0.4693361222743988,
      "reward_std": 0.07258699834346771,
      "rewards/gemini_judge_reward_func/mean": 0.1529017835855484,
      "rewards/gemini_judge_reward_func/std": 0.27057167887687683,
      "rewards/semantic_correctness_reward_func/mean": 0.4678769111633301,
      "rewards/semantic_correctness_reward_func/std": 0.22107519209384918,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 478
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 313.0,
      "completions/max_terminated_length": 313.0,
      "completions/mean_length": 149.625,
      "completions/mean_terminated_length": 149.625,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.1635440228756775,
      "grad_norm": 0.020050855353474617,
      "kl": 0.019238948822021484,
      "learning_rate": 6.0179839182188125e-06,
      "loss": 0.0007,
      "num_tokens": 171058499.0,
      "reward": 0.4546305239200592,
      "reward_std": 0.05301572382450104,
      "rewards/gemini_judge_reward_func/mean": 0.1462053507566452,
      "rewards/gemini_judge_reward_func/std": 0.24282783269882202,
      "rewards/semantic_correctness_reward_func/mean": 0.44431307911872864,
      "rewards/semantic_correctness_reward_func/std": 0.20877443253993988,
      "rewards/xmlcount_reward_func/mean": 0.7682143449783325,
      "rewards/xmlcount_reward_func/std": 0.4233846962451935,
      "step": 479
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 797.0,
      "completions/mean_length": 163.11607360839844,
      "completions/mean_terminated_length": 155.36036682128906,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.1638854508983825,
      "grad_norm": 0.01931094005703926,
      "kl": 0.01724720001220703,
      "learning_rate": 6.001790005445607e-06,
      "loss": -0.0094,
      "num_tokens": 171442269.0,
      "reward": 0.42174994945526123,
      "reward_std": 0.07509782910346985,
      "rewards/gemini_judge_reward_func/mean": 0.140625,
      "rewards/gemini_judge_reward_func/std": 0.2523016333580017,
      "rewards/semantic_correctness_reward_func/mean": 0.464526504278183,
      "rewards/semantic_correctness_reward_func/std": 0.20688366889953613,
      "rewards/xmlcount_reward_func/mean": 0.6814866662025452,
      "rewards/xmlcount_reward_func/std": 0.46647319197654724,
      "step": 480
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 649.0,
      "completions/mean_length": 158.91964721679688,
      "completions/mean_terminated_length": 155.0403594970703,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.16422687892108745,
      "grad_norm": 0.019207188859581947,
      "kl": 0.019034385681152344,
      "learning_rate": 5.985585137257401e-06,
      "loss": -0.0056,
      "num_tokens": 171771759.0,
      "reward": 0.5069795846939087,
      "reward_std": 0.07559894770383835,
      "rewards/gemini_judge_reward_func/mean": 0.1729910671710968,
      "rewards/gemini_judge_reward_func/std": 0.264567494392395,
      "rewards/semantic_correctness_reward_func/mean": 0.47291556000709534,
      "rewards/semantic_correctness_reward_func/std": 0.23877963423728943,
      "rewards/xmlcount_reward_func/mean": 0.8580000996589661,
      "rewards/xmlcount_reward_func/std": 0.35106155276298523,
      "step": 481
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 884.0,
      "completions/mean_length": 155.44644165039062,
      "completions/mean_terminated_length": 147.6216278076172,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.1645683069437924,
      "grad_norm": 0.01969590224325657,
      "kl": 0.015825271606445312,
      "learning_rate": 5.969369490868042e-06,
      "loss": -0.019,
      "num_tokens": 172126495.0,
      "reward": 0.4206188917160034,
      "reward_std": 0.06250383704900742,
      "rewards/gemini_judge_reward_func/mean": 0.0904017835855484,
      "rewards/gemini_judge_reward_func/std": 0.21680375933647156,
      "rewards/semantic_correctness_reward_func/mean": 0.4146478474140167,
      "rewards/semantic_correctness_reward_func/std": 0.20701521635055542,
      "rewards/xmlcount_reward_func/mean": 0.7538214921951294,
      "rewards/xmlcount_reward_func/std": 0.42685988545417786,
      "step": 482
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 286.0,
      "completions/mean_length": 161.5625,
      "completions/mean_terminated_length": 153.7928009033203,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.16490973496649738,
      "grad_norm": 0.02005629986524582,
      "kl": 0.015123367309570312,
      "learning_rate": 5.953143243609235e-06,
      "loss": -0.0137,
      "num_tokens": 172462793.0,
      "reward": 0.4773969054222107,
      "reward_std": 0.06708209216594696,
      "rewards/gemini_judge_reward_func/mean": 0.1361607164144516,
      "rewards/gemini_judge_reward_func/std": 0.27587687969207764,
      "rewards/semantic_correctness_reward_func/mean": 0.4344128966331482,
      "rewards/semantic_correctness_reward_func/std": 0.23222900927066803,
      "rewards/xmlcount_reward_func/mean": 0.8401250243186951,
      "rewards/xmlcount_reward_func/std": 0.3684578537940979,
      "step": 483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 485.0,
      "completions/mean_length": 159.9553680419922,
      "completions/mean_terminated_length": 152.17117309570312,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.16525116298920234,
      "grad_norm": 0.020192833617329597,
      "kl": 0.014693260192871094,
      "learning_rate": 5.936906572928625e-06,
      "loss": -0.0159,
      "num_tokens": 172781779.0,
      "reward": 0.44325000047683716,
      "reward_std": 0.06479258835315704,
      "rewards/gemini_judge_reward_func/mean": 0.1004464253783226,
      "rewards/gemini_judge_reward_func/std": 0.22325512766838074,
      "rewards/semantic_correctness_reward_func/mean": 0.4066070020198822,
      "rewards/semantic_correctness_reward_func/std": 0.18697205185890198,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 165.2857208251953,
      "completions/mean_terminated_length": 153.62896728515625,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.1655925910119073,
      "grad_norm": 0.02016579918563366,
      "kl": 0.014116764068603516,
      "learning_rate": 5.920659656387836e-06,
      "loss": -0.024,
      "num_tokens": 173160955.0,
      "reward": 0.3937126696109772,
      "reward_std": 0.06098075583577156,
      "rewards/gemini_judge_reward_func/mean": 0.0993303582072258,
      "rewards/gemini_judge_reward_func/std": 0.20615418255329132,
      "rewards/semantic_correctness_reward_func/mean": 0.4114026427268982,
      "rewards/semantic_correctness_reward_func/std": 0.1928534358739853,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 452.0,
      "completions/mean_length": 175.7053680419922,
      "completions/mean_terminated_length": 152.3577880859375,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.16593401903461227,
      "grad_norm": 0.020396245643496513,
      "kl": 0.014238834381103516,
      "learning_rate": 5.904402671660551e-06,
      "loss": -0.0438,
      "num_tokens": 173516949.0,
      "reward": 0.42300131916999817,
      "reward_std": 0.0696081817150116,
      "rewards/gemini_judge_reward_func/mean": 0.109375,
      "rewards/gemini_judge_reward_func/std": 0.22406068444252014,
      "rewards/semantic_correctness_reward_func/mean": 0.3947562873363495,
      "rewards/semantic_correctness_reward_func/std": 0.20920373499393463,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 486
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 168.2232208251953,
      "completions/mean_terminated_length": 152.66363525390625,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.16627544705731723,
      "grad_norm": 0.02036086469888687,
      "kl": 0.01621556282043457,
      "learning_rate": 5.8881357965305444e-06,
      "loss": -0.0085,
      "num_tokens": 173878327.0,
      "reward": 0.4611518383026123,
      "reward_std": 0.08240365236997604,
      "rewards/gemini_judge_reward_func/mean": 0.1696428507566452,
      "rewards/gemini_judge_reward_func/std": 0.27956220507621765,
      "rewards/semantic_correctness_reward_func/mean": 0.4649732708930969,
      "rewards/semantic_correctness_reward_func/std": 0.21387185156345367,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 487
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 874.0,
      "completions/mean_length": 165.1116180419922,
      "completions/mean_terminated_length": 153.45249938964844,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.16661687508002218,
      "grad_norm": 0.020625097677111626,
      "kl": 0.015746593475341797,
      "learning_rate": 5.871859208889759e-06,
      "loss": -0.0143,
      "num_tokens": 174251580.0,
      "reward": 0.4308580458164215,
      "reward_std": 0.06439037621021271,
      "rewards/gemini_judge_reward_func/mean": 0.1104910746216774,
      "rewards/gemini_judge_reward_func/std": 0.22288212180137634,
      "rewards/semantic_correctness_reward_func/mean": 0.4266740679740906,
      "rewards/semantic_correctness_reward_func/std": 0.22454437613487244,
      "rewards/xmlcount_reward_func/mean": 0.7533169984817505,
      "rewards/xmlcount_reward_func/std": 0.4281919598579407,
      "step": 488
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 943.0,
      "completions/mean_length": 159.9419708251953,
      "completions/mean_terminated_length": 156.06727600097656,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.16695830310272716,
      "grad_norm": 0.02049202099442482,
      "kl": 0.015564441680908203,
      "learning_rate": 5.855573086736351e-06,
      "loss": -0.0176,
      "num_tokens": 174562027.0,
      "reward": 0.5011279582977295,
      "reward_std": 0.07227544486522675,
      "rewards/gemini_judge_reward_func/mean": 0.1651785671710968,
      "rewards/gemini_judge_reward_func/std": 0.2832260727882385,
      "rewards/semantic_correctness_reward_func/mean": 0.45928245782852173,
      "rewards/semantic_correctness_reward_func/std": 0.20571810007095337,
      "rewards/xmlcount_reward_func/mean": 0.8580000996589661,
      "rewards/xmlcount_reward_func/std": 0.35106155276298523,
      "step": 489
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 615.0,
      "completions/mean_length": 156.5178680419922,
      "completions/mean_terminated_length": 144.74208068847656,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.16729973112543212,
      "grad_norm": 0.020085543394088745,
      "kl": 0.016694068908691406,
      "learning_rate": 5.839277608172739e-06,
      "loss": 0.006,
      "num_tokens": 174919303.0,
      "reward": 0.41649171710014343,
      "reward_std": 0.0659632682800293,
      "rewards/gemini_judge_reward_func/mean": 0.1183035746216774,
      "rewards/gemini_judge_reward_func/std": 0.2142873853445053,
      "rewards/semantic_correctness_reward_func/mean": 0.415851354598999,
      "rewards/semantic_correctness_reward_func/std": 0.17620055377483368,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 490
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 619.0,
      "completions/mean_length": 159.98214721679688,
      "completions/mean_terminated_length": 152.1981964111328,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.16764115914813707,
      "grad_norm": 0.019799429923295975,
      "kl": 0.015845775604248047,
      "learning_rate": 5.82297295140367e-06,
      "loss": -0.0318,
      "num_tokens": 175240287.0,
      "reward": 0.4717130959033966,
      "reward_std": 0.07357439398765564,
      "rewards/gemini_judge_reward_func/mean": 0.1316964328289032,
      "rewards/gemini_judge_reward_func/std": 0.2635558247566223,
      "rewards/semantic_correctness_reward_func/mean": 0.42951181530952454,
      "rewards/semantic_correctness_reward_func/std": 0.229256734251976,
      "rewards/xmlcount_reward_func/mean": 0.8328304290771484,
      "rewards/xmlcount_reward_func/std": 0.373677134513855,
      "step": 491
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 171.95089721679688,
      "completions/mean_terminated_length": 164.2747802734375,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.16798258717084205,
      "grad_norm": 0.020750809460878372,
      "kl": 0.015099048614501953,
      "learning_rate": 5.806659294734256e-06,
      "loss": 0.0052,
      "num_tokens": 175559164.0,
      "reward": 0.46661117672920227,
      "reward_std": 0.08188282698392868,
      "rewards/gemini_judge_reward_func/mean": 0.1372767835855484,
      "rewards/gemini_judge_reward_func/std": 0.2399667352437973,
      "rewards/semantic_correctness_reward_func/mean": 0.4586896598339081,
      "rewards/semantic_correctness_reward_func/std": 0.1945486068725586,
      "rewards/xmlcount_reward_func/mean": 0.799906313419342,
      "rewards/xmlcount_reward_func/std": 0.40196701884269714,
      "step": 492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 346.0,
      "completions/mean_length": 162.375,
      "completions/mean_terminated_length": 154.61260986328125,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.168324015193547,
      "grad_norm": 0.019258791580796242,
      "kl": 0.013692378997802734,
      "learning_rate": 5.790336816568033e-06,
      "loss": -0.0041,
      "num_tokens": 175921760.0,
      "reward": 0.4894059896469116,
      "reward_std": 0.07220742106437683,
      "rewards/gemini_judge_reward_func/mean": 0.1495535671710968,
      "rewards/gemini_judge_reward_func/std": 0.2820485234260559,
      "rewards/semantic_correctness_reward_func/mean": 0.4319226145744324,
      "rewards/semantic_correctness_reward_func/std": 0.2171761691570282,
      "rewards/xmlcount_reward_func/mean": 0.8580000996589661,
      "rewards/xmlcount_reward_func/std": 0.35106152296066284,
      "step": 493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 275.0,
      "completions/mean_length": 149.09375,
      "completions/mean_terminated_length": 141.2117156982422,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.168665443216252,
      "grad_norm": 0.020740794017910957,
      "kl": 0.01919412612915039,
      "learning_rate": 5.774005695405008e-06,
      "loss": 0.0211,
      "num_tokens": 176274773.0,
      "reward": 0.42292988300323486,
      "reward_std": 0.061523277312517166,
      "rewards/gemini_judge_reward_func/mean": 0.0982142835855484,
      "rewards/gemini_judge_reward_func/std": 0.20736749470233917,
      "rewards/semantic_correctness_reward_func/mean": 0.4547029137611389,
      "rewards/semantic_correctness_reward_func/std": 0.18523745238780975,
      "rewards/xmlcount_reward_func/mean": 0.7317589521408081,
      "rewards/xmlcount_reward_func/std": 0.4439156651496887,
      "step": 494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 712.0,
      "completions/mean_length": 156.82144165039062,
      "completions/mean_terminated_length": 145.04977416992188,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.16900687123895694,
      "grad_norm": 0.02109484001994133,
      "kl": 0.018070220947265625,
      "learning_rate": 5.7576661098397024e-06,
      "loss": -0.022,
      "num_tokens": 176663369.0,
      "reward": 0.415505588054657,
      "reward_std": 0.07010926306247711,
      "rewards/gemini_judge_reward_func/mean": 0.1774553507566452,
      "rewards/gemini_judge_reward_func/std": 0.3051034212112427,
      "rewards/semantic_correctness_reward_func/mean": 0.4624207317829132,
      "rewards/semantic_correctness_reward_func/std": 0.2060934156179428,
      "rewards/xmlcount_reward_func/mean": 0.6300982236862183,
      "rewards/xmlcount_reward_func/std": 0.48218870162963867,
      "step": 495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 154.91519165039062,
      "completions/mean_terminated_length": 147.08558654785156,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.1693482992616619,
      "grad_norm": 0.020870883017778397,
      "kl": 0.014084815979003906,
      "learning_rate": 5.74131823855921e-06,
      "loss": -0.0061,
      "num_tokens": 177009766.0,
      "reward": 0.4837039113044739,
      "reward_std": 0.06995401531457901,
      "rewards/gemini_judge_reward_func/mean": 0.1696428507566452,
      "rewards/gemini_judge_reward_func/std": 0.28156012296676636,
      "rewards/semantic_correctness_reward_func/mean": 0.4704835116863251,
      "rewards/semantic_correctness_reward_func/std": 0.23031000792980194,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 688.0,
      "completions/mean_length": 172.81251525878906,
      "completions/mean_terminated_length": 149.38531494140625,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.16968972728436688,
      "grad_norm": 0.020632216706871986,
      "kl": 0.017904996871948242,
      "learning_rate": 5.72496226034123e-06,
      "loss": -0.0237,
      "num_tokens": 177378632.0,
      "reward": 0.4311949610710144,
      "reward_std": 0.06168566271662712,
      "rewards/gemini_judge_reward_func/mean": 0.1071428582072258,
      "rewards/gemini_judge_reward_func/std": 0.24539317190647125,
      "rewards/semantic_correctness_reward_func/mean": 0.44018909335136414,
      "rewards/semantic_correctness_reward_func/std": 0.20811137557029724,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 497
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 924.0,
      "completions/mean_length": 165.08482360839844,
      "completions/mean_terminated_length": 153.4253387451172,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.17003115530707183,
      "grad_norm": 0.01952049508690834,
      "kl": 0.01712656021118164,
      "learning_rate": 5.708598354052122e-06,
      "loss": 0.007,
      "num_tokens": 177727219.0,
      "reward": 0.4157637357711792,
      "reward_std": 0.05787918344140053,
      "rewards/gemini_judge_reward_func/mean": 0.1395089328289032,
      "rewards/gemini_judge_reward_func/std": 0.25457999110221863,
      "rewards/semantic_correctness_reward_func/mean": 0.4413006007671356,
      "rewards/semantic_correctness_reward_func/std": 0.212608203291893,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 435.0,
      "completions/mean_length": 152.03125,
      "completions/mean_terminated_length": 148.12107849121094,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.17037258332977678,
      "grad_norm": 0.020889515057206154,
      "kl": 0.017127513885498047,
      "learning_rate": 5.692226698644938e-06,
      "loss": -0.0171,
      "num_tokens": 178073166.0,
      "reward": 0.466022253036499,
      "reward_std": 0.07162578403949738,
      "rewards/gemini_judge_reward_func/mean": 0.1540178507566452,
      "rewards/gemini_judge_reward_func/std": 0.2349638193845749,
      "rewards/semantic_correctness_reward_func/mean": 0.4848252236843109,
      "rewards/semantic_correctness_reward_func/std": 0.20572321116924286,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 499
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 730.0,
      "completions/mean_length": 161.67857360839844,
      "completions/mean_terminated_length": 146.0,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.17071401135248176,
      "grad_norm": 0.020642045885324478,
      "kl": 0.01847362518310547,
      "learning_rate": 5.675847473157485e-06,
      "loss": -0.0314,
      "num_tokens": 178409774.0,
      "reward": 0.47827115654945374,
      "reward_std": 0.05761899799108505,
      "rewards/gemini_judge_reward_func/mean": 0.15625,
      "rewards/gemini_judge_reward_func/std": 0.2660224437713623,
      "rewards/semantic_correctness_reward_func/mean": 0.47010570764541626,
      "rewards/semantic_correctness_reward_func/std": 0.2151022106409073,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 500
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 386.0,
      "completions/mean_length": 152.6741180419922,
      "completions/mean_terminated_length": 148.76683044433594,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.17105543937518672,
      "grad_norm": 0.021922262385487556,
      "kl": 0.014860153198242188,
      "learning_rate": 5.659460856710346e-06,
      "loss": -0.0213,
      "num_tokens": 178727305.0,
      "reward": 0.45994651317596436,
      "reward_std": 0.06950204819440842,
      "rewards/gemini_judge_reward_func/mean": 0.1462053507566452,
      "rewards/gemini_judge_reward_func/std": 0.26806291937828064,
      "rewards/semantic_correctness_reward_func/mean": 0.47231271862983704,
      "rewards/semantic_correctness_reward_func/std": 0.22776320576667786,
      "rewards/xmlcount_reward_func/mean": 0.7675045728683472,
      "rewards/xmlcount_reward_func/std": 0.42328277230262756,
      "step": 501
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 173.3928680419922,
      "completions/mean_terminated_length": 149.9816436767578,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.17139686739789167,
      "grad_norm": 0.021201200783252716,
      "kl": 0.018434524536132812,
      "learning_rate": 5.643067028504931e-06,
      "loss": -0.0353,
      "num_tokens": 179102705.0,
      "reward": 0.40908685326576233,
      "reward_std": 0.06099972128868103,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.21042221784591675,
      "rewards/semantic_correctness_reward_func/mean": 0.4324699938297272,
      "rewards/semantic_correctness_reward_func/std": 0.19328351318836212,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 502
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 664.0,
      "completions/mean_length": 169.33929443359375,
      "completions/mean_terminated_length": 149.82647705078125,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.17173829542059665,
      "grad_norm": 0.01934802159667015,
      "kl": 0.016072750091552734,
      "learning_rate": 5.626666167821522e-06,
      "loss": -0.0409,
      "num_tokens": 179471349.0,
      "reward": 0.420356422662735,
      "reward_std": 0.07176917046308517,
      "rewards/gemini_judge_reward_func/mean": 0.1473214328289032,
      "rewards/gemini_judge_reward_func/std": 0.2562088966369629,
      "rewards/semantic_correctness_reward_func/mean": 0.44863906502723694,
      "rewards/semantic_correctness_reward_func/std": 0.2033475935459137,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 503
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 273.0,
      "completions/mean_length": 157.10714721679688,
      "completions/mean_terminated_length": 153.21974182128906,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.1720797234433016,
      "grad_norm": 0.021044226363301277,
      "kl": 0.016061782836914062,
      "learning_rate": 5.610258454017301e-06,
      "loss": -0.0048,
      "num_tokens": 179818133.0,
      "reward": 0.4817381203174591,
      "reward_std": 0.05548393726348877,
      "rewards/gemini_judge_reward_func/mean": 0.1696428507566452,
      "rewards/gemini_judge_reward_func/std": 0.2855140268802643,
      "rewards/semantic_correctness_reward_func/mean": 0.46065473556518555,
      "rewards/semantic_correctness_reward_func/std": 0.23221111297607422,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 504
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 893.0,
      "completions/mean_length": 183.8928680419922,
      "completions/mean_terminated_length": 152.7777862548828,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.17242115146600656,
      "grad_norm": 0.019027721136808395,
      "kl": 0.014852523803710938,
      "learning_rate": 5.593844066524401e-06,
      "loss": -0.0089,
      "num_tokens": 180172325.0,
      "reward": 0.39402300119400024,
      "reward_std": 0.05319085344672203,
      "rewards/gemini_judge_reward_func/mean": 0.0915178582072258,
      "rewards/gemini_judge_reward_func/std": 0.2220863550901413,
      "rewards/semantic_correctness_reward_func/mean": 0.3928290903568268,
      "rewards/semantic_correctness_reward_func/std": 0.20905858278274536,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 627.0,
      "completions/mean_length": 177.88839721679688,
      "completions/mean_terminated_length": 154.6009063720703,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.17276257948871154,
      "grad_norm": 0.02053193189203739,
      "kl": 0.01651144027709961,
      "learning_rate": 5.577423184847932e-06,
      "loss": -0.0192,
      "num_tokens": 180531340.0,
      "reward": 0.4295152425765991,
      "reward_std": 0.05642404407262802,
      "rewards/gemini_judge_reward_func/mean": 0.0837053582072258,
      "rewards/gemini_judge_reward_func/std": 0.18177036941051483,
      "rewards/semantic_correctness_reward_func/mean": 0.4116385877132416,
      "rewards/semantic_correctness_reward_func/std": 0.17686069011688232,
      "rewards/xmlcount_reward_func/mean": 0.7842634916305542,
      "rewards/xmlcount_reward_func/std": 0.41184648871421814,
      "step": 506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 364.0,
      "completions/mean_length": 165.13839721679688,
      "completions/mean_terminated_length": 149.52272033691406,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.1731040075114165,
      "grad_norm": 0.01986338570713997,
      "kl": 0.013272762298583984,
      "learning_rate": 5.560995988564023e-06,
      "loss": 0.014,
      "num_tokens": 180879259.0,
      "reward": 0.4700644016265869,
      "reward_std": 0.08002512902021408,
      "rewards/gemini_judge_reward_func/mean": 0.1540178507566452,
      "rewards/gemini_judge_reward_func/std": 0.26522162556648254,
      "rewards/semantic_correctness_reward_func/mean": 0.43353599309921265,
      "rewards/semantic_correctness_reward_func/std": 0.20602788031101227,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 507
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 367.0,
      "completions/mean_length": 157.4553680419922,
      "completions/mean_terminated_length": 145.6923065185547,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.17344543553412145,
      "grad_norm": 0.02088193967938423,
      "kl": 0.015688419342041016,
      "learning_rate": 5.544562657317863e-06,
      "loss": -0.0335,
      "num_tokens": 181218669.0,
      "reward": 0.4463382661342621,
      "reward_std": 0.07036608457565308,
      "rewards/gemini_judge_reward_func/mean": 0.1238839253783226,
      "rewards/gemini_judge_reward_func/std": 0.24027937650680542,
      "rewards/semantic_correctness_reward_func/mean": 0.4466731548309326,
      "rewards/semantic_correctness_reward_func/std": 0.2101128250360489,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 508
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 464.0,
      "completions/mean_length": 164.67857360839844,
      "completions/mean_terminated_length": 145.05935668945312,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.17378686355682643,
      "grad_norm": 0.018736766651272774,
      "kl": 0.014445781707763672,
      "learning_rate": 5.52812337082173e-06,
      "loss": -0.0104,
      "num_tokens": 181593377.0,
      "reward": 0.43417415022850037,
      "reward_std": 0.06118585541844368,
      "rewards/gemini_judge_reward_func/mean": 0.1328125,
      "rewards/gemini_judge_reward_func/std": 0.26245927810668945,
      "rewards/semantic_correctness_reward_func/mean": 0.41679030656814575,
      "rewards/semantic_correctness_reward_func/std": 0.2354862242937088,
      "rewards/xmlcount_reward_func/mean": 0.7442277073860168,
      "rewards/xmlcount_reward_func/std": 0.4368407428264618,
      "step": 509
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 580.0,
      "completions/max_terminated_length": 580.0,
      "completions/mean_length": 151.02679443359375,
      "completions/mean_terminated_length": 151.02679443359375,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.17412829157953139,
      "grad_norm": 0.02046004869043827,
      "kl": 0.01661968231201172,
      "learning_rate": 5.5116783088530255e-06,
      "loss": -0.0076,
      "num_tokens": 181941603.0,
      "reward": 0.44803786277770996,
      "reward_std": 0.07080157846212387,
      "rewards/gemini_judge_reward_func/mean": 0.1618303507566452,
      "rewards/gemini_judge_reward_func/std": 0.27668073773384094,
      "rewards/semantic_correctness_reward_func/mean": 0.4507783055305481,
      "rewards/semantic_correctness_reward_func/std": 0.22416646778583527,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 510
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 181.9419708251953,
      "completions/mean_terminated_length": 166.63180541992188,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.17446971960223637,
      "grad_norm": 0.021340545266866684,
      "kl": 0.015408992767333984,
      "learning_rate": 5.495227651252315e-06,
      "loss": -0.0083,
      "num_tokens": 182312514.0,
      "reward": 0.41029632091522217,
      "reward_std": 0.06794978678226471,
      "rewards/gemini_judge_reward_func/mean": 0.1261160671710968,
      "rewards/gemini_judge_reward_func/std": 0.2391100972890854,
      "rewards/semantic_correctness_reward_func/mean": 0.44074922800064087,
      "rewards/semantic_correctness_reward_func/std": 0.20661385357379913,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 511
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 271.0,
      "completions/mean_length": 167.875,
      "completions/mean_terminated_length": 152.30908203125,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.17481114762494132,
      "grad_norm": 0.01973101496696472,
      "kl": 0.014441490173339844,
      "learning_rate": 5.478771577921351e-06,
      "loss": -0.0045,
      "num_tokens": 182654030.0,
      "reward": 0.47068971395492554,
      "reward_std": 0.0690293237566948,
      "rewards/gemini_judge_reward_func/mean": 0.140625,
      "rewards/gemini_judge_reward_func/std": 0.2746405303478241,
      "rewards/semantic_correctness_reward_func/mean": 0.46344852447509766,
      "rewards/semantic_correctness_reward_func/std": 0.2121262550354004,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 512
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 791.0,
      "completions/mean_length": 161.9107208251953,
      "completions/mean_terminated_length": 154.14414978027344,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.17515257564764627,
      "grad_norm": 0.01982654258608818,
      "kl": 0.015091419219970703,
      "learning_rate": 5.4623102688211186e-06,
      "loss": -0.0241,
      "num_tokens": 182983714.0,
      "reward": 0.4770260155200958,
      "reward_std": 0.07111723721027374,
      "rewards/gemini_judge_reward_func/mean": 0.1116071417927742,
      "rewards/gemini_judge_reward_func/std": 0.2242058366537094,
      "rewards/semantic_correctness_reward_func/mean": 0.41016557812690735,
      "rewards/semantic_correctness_reward_func/std": 0.1983053833246231,
      "rewards/xmlcount_reward_func/mean": 0.8758750557899475,
      "rewards/xmlcount_reward_func/std": 0.33179107308387756,
      "step": 513
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 813.0,
      "completions/mean_length": 169.22769165039062,
      "completions/mean_terminated_length": 149.7123260498047,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.17549400367035126,
      "grad_norm": 0.018747977912425995,
      "kl": 0.015319347381591797,
      "learning_rate": 5.445843903969854e-06,
      "loss": -0.0188,
      "num_tokens": 183362849.0,
      "reward": 0.38659247756004333,
      "reward_std": 0.04423471912741661,
      "rewards/gemini_judge_reward_func/mean": 0.0691964253783226,
      "rewards/gemini_judge_reward_func/std": 0.1560075730085373,
      "rewards/semantic_correctness_reward_func/mean": 0.40031924843788147,
      "rewards/semantic_correctness_reward_func/std": 0.18306925892829895,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 514
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 931.0,
      "completions/mean_length": 169.09375,
      "completions/mean_terminated_length": 161.3918914794922,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.1758354316930562,
      "grad_norm": 0.020533520728349686,
      "kl": 0.01604938507080078,
      "learning_rate": 5.429372663441086e-06,
      "loss": -0.0012,
      "num_tokens": 183731974.0,
      "reward": 0.4428136944770813,
      "reward_std": 0.07406413555145264,
      "rewards/gemini_judge_reward_func/mean": 0.1439732164144516,
      "rewards/gemini_judge_reward_func/std": 0.26507771015167236,
      "rewards/semantic_correctness_reward_func/mean": 0.4514612555503845,
      "rewards/semantic_correctness_reward_func/std": 0.20401346683502197,
      "rewards/xmlcount_reward_func/mean": 0.737330436706543,
      "rewards/xmlcount_reward_func/std": 0.439359575510025,
      "step": 515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 319.0,
      "completions/mean_length": 162.94644165039062,
      "completions/mean_terminated_length": 143.28765869140625,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.17617685971576116,
      "grad_norm": 0.02042176015675068,
      "kl": 0.01526498794555664,
      "learning_rate": 5.412896727361663e-06,
      "loss": 0.0011,
      "num_tokens": 184076814.0,
      "reward": 0.42131778597831726,
      "reward_std": 0.06919633597135544,
      "rewards/gemini_judge_reward_func/mean": 0.1484375,
      "rewards/gemini_judge_reward_func/std": 0.22710849344730377,
      "rewards/semantic_correctness_reward_func/mean": 0.4512138366699219,
      "rewards/semantic_correctness_reward_func/std": 0.19391006231307983,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 355.0,
      "completions/mean_length": 159.17857360839844,
      "completions/mean_terminated_length": 147.4389190673828,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.17651828773846615,
      "grad_norm": 0.02078302949666977,
      "kl": 0.014979839324951172,
      "learning_rate": 5.396416275909779e-06,
      "loss": 0.0254,
      "num_tokens": 184454794.0,
      "reward": 0.4277709722518921,
      "reward_std": 0.06490686535835266,
      "rewards/gemini_judge_reward_func/mean": 0.1104910746216774,
      "rewards/gemini_judge_reward_func/std": 0.21258442103862762,
      "rewards/semantic_correctness_reward_func/mean": 0.404167115688324,
      "rewards/semantic_correctness_reward_func/std": 0.19909650087356567,
      "rewards/xmlcount_reward_func/mean": 0.7568526864051819,
      "rewards/xmlcount_reward_func/std": 0.42892637848854065,
      "step": 517
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 450.0,
      "completions/mean_length": 163.63839721679688,
      "completions/mean_terminated_length": 151.95928955078125,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.1768597157611711,
      "grad_norm": 0.020726703107357025,
      "kl": 0.017206192016601562,
      "learning_rate": 5.379931489313016e-06,
      "loss": 0.0103,
      "num_tokens": 184798381.0,
      "reward": 0.44928938150405884,
      "reward_std": 0.058755356818437576,
      "rewards/gemini_judge_reward_func/mean": 0.1316964328289032,
      "rewards/gemini_judge_reward_func/std": 0.28005409240722656,
      "rewards/semantic_correctness_reward_func/mean": 0.44580385088920593,
      "rewards/semantic_correctness_reward_func/std": 0.22318150103092194,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 518
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 326.0,
      "completions/mean_length": 158.07589721679688,
      "completions/mean_terminated_length": 138.30592346191406,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.17720114378387605,
      "grad_norm": 0.02273509092628956,
      "kl": 0.017033815383911133,
      "learning_rate": 5.363442547846356e-06,
      "loss": -0.0056,
      "num_tokens": 185169506.0,
      "reward": 0.4369128942489624,
      "reward_std": 0.08179816603660583,
      "rewards/gemini_judge_reward_func/mean": 0.1517857164144516,
      "rewards/gemini_judge_reward_func/std": 0.28283703327178955,
      "rewards/semantic_correctness_reward_func/mean": 0.45099279284477234,
      "rewards/semantic_correctness_reward_func/std": 0.21851813793182373,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 519
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 651.0,
      "completions/mean_length": 167.00894165039062,
      "completions/mean_terminated_length": 155.37557983398438,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.17754257180658103,
      "grad_norm": 0.02012968808412552,
      "kl": 0.015107154846191406,
      "learning_rate": 5.346949631830221e-06,
      "loss": -0.0077,
      "num_tokens": 185521656.0,
      "reward": 0.434848815202713,
      "reward_std": 0.07218959927558899,
      "rewards/gemini_judge_reward_func/mean": 0.140625,
      "rewards/gemini_judge_reward_func/std": 0.25888094305992126,
      "rewards/semantic_correctness_reward_func/mean": 0.4599758982658386,
      "rewards/semantic_correctness_reward_func/std": 0.19838641583919525,
      "rewards/xmlcount_reward_func/mean": 0.7165089845657349,
      "rewards/xmlcount_reward_func/std": 0.45138630270957947,
      "step": 520
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 180.83482360839844,
      "completions/mean_terminated_length": 149.60647583007812,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.177883999829286,
      "grad_norm": 0.021219704300165176,
      "kl": 0.01463770866394043,
      "learning_rate": 5.3304529216284974e-06,
      "loss": -0.0383,
      "num_tokens": 185866755.0,
      "reward": 0.4066275954246521,
      "reward_std": 0.0540161170065403,
      "rewards/gemini_judge_reward_func/mean": 0.1037946417927742,
      "rewards/gemini_judge_reward_func/std": 0.20530273020267487,
      "rewards/semantic_correctness_reward_func/mean": 0.43129876255989075,
      "rewards/semantic_correctness_reward_func/std": 0.20759737491607666,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 521
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 861.0,
      "completions/mean_length": 184.32589721679688,
      "completions/mean_terminated_length": 161.21559143066406,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.17822542785199094,
      "grad_norm": 0.020118800923228264,
      "kl": 0.014031648635864258,
      "learning_rate": 5.3139525976465675e-06,
      "loss": -0.0155,
      "num_tokens": 186255640.0,
      "reward": 0.4363122284412384,
      "reward_std": 0.056919172406196594,
      "rewards/gemini_judge_reward_func/mean": 0.1428571492433548,
      "rewards/gemini_judge_reward_func/std": 0.24879594147205353,
      "rewards/semantic_correctness_reward_func/mean": 0.4300965964794159,
      "rewards/semantic_correctness_reward_func/std": 0.21386457979679108,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 907.0,
      "completions/mean_length": 157.47769165039062,
      "completions/mean_terminated_length": 153.59193420410156,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.17856685587469592,
      "grad_norm": 0.01910630241036415,
      "kl": 0.01791095733642578,
      "learning_rate": 5.2974488403293285e-06,
      "loss": -0.0158,
      "num_tokens": 186620139.0,
      "reward": 0.4440222680568695,
      "reward_std": 0.06990361213684082,
      "rewards/gemini_judge_reward_func/mean": 0.1696428507566452,
      "rewards/gemini_judge_reward_func/std": 0.2913442552089691,
      "rewards/semantic_correctness_reward_func/mean": 0.48657554388046265,
      "rewards/semantic_correctness_reward_func/std": 0.2125052511692047,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 523
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 554.0,
      "completions/mean_length": 179.77679443359375,
      "completions/mean_terminated_length": 148.50926208496094,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.17890828389740088,
      "grad_norm": 0.019049007445573807,
      "kl": 0.015116214752197266,
      "learning_rate": 5.280941830159228e-06,
      "loss": -0.0286,
      "num_tokens": 187000085.0,
      "reward": 0.45324477553367615,
      "reward_std": 0.07729536294937134,
      "rewards/gemini_judge_reward_func/mean": 0.1729910671710968,
      "rewards/gemini_judge_reward_func/std": 0.29080912470817566,
      "rewards/semantic_correctness_reward_func/mean": 0.49024146795272827,
      "rewards/semantic_correctness_reward_func/std": 0.21569402515888214,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 524
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 841.0,
      "completions/mean_length": 160.98214721679688,
      "completions/mean_terminated_length": 149.2669677734375,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.17924971192010583,
      "grad_norm": 0.020367106422781944,
      "kl": 0.016560077667236328,
      "learning_rate": 5.264431747654284e-06,
      "loss": 0.0176,
      "num_tokens": 187355309.0,
      "reward": 0.42137518525123596,
      "reward_std": 0.0671260803937912,
      "rewards/gemini_judge_reward_func/mean": 0.1529017835855484,
      "rewards/gemini_judge_reward_func/std": 0.25565895438194275,
      "rewards/semantic_correctness_reward_func/mean": 0.43138477206230164,
      "rewards/semantic_correctness_reward_func/std": 0.203949972987175,
      "rewards/xmlcount_reward_func/mean": 0.6848437190055847,
      "rewards/xmlcount_reward_func/std": 0.46544113755226135,
      "step": 525
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 362.0,
      "completions/mean_length": 156.57144165039062,
      "completions/mean_terminated_length": 144.79638671875,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.1795911399428108,
      "grad_norm": 0.02240682952105999,
      "kl": 0.013727188110351562,
      "learning_rate": 5.247918773366112e-06,
      "loss": -0.0034,
      "num_tokens": 187717201.0,
      "reward": 0.44608941674232483,
      "reward_std": 0.07263628393411636,
      "rewards/gemini_judge_reward_func/mean": 0.1383928507566452,
      "rewards/gemini_judge_reward_func/std": 0.26116180419921875,
      "rewards/semantic_correctness_reward_func/mean": 0.4700183868408203,
      "rewards/semantic_correctness_reward_func/std": 0.23790188133716583,
      "rewards/xmlcount_reward_func/mean": 0.7418214082717896,
      "rewards/xmlcount_reward_func/std": 0.4343191385269165,
      "step": 526
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 454.0,
      "completions/mean_length": 171.1919708251953,
      "completions/mean_terminated_length": 151.72145080566406,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.17993256796551577,
      "grad_norm": 0.020298024639487267,
      "kl": 0.012287616729736328,
      "learning_rate": 5.231403087877955e-06,
      "loss": 0.0,
      "num_tokens": 188043636.0,
      "reward": 0.5017148852348328,
      "reward_std": 0.05507539212703705,
      "rewards/gemini_judge_reward_func/mean": 0.1930803507566452,
      "rewards/gemini_judge_reward_func/std": 0.29825180768966675,
      "rewards/semantic_correctness_reward_func/mean": 0.4779132306575775,
      "rewards/semantic_correctness_reward_func/std": 0.2128426432609558,
      "rewards/xmlcount_reward_func/mean": 0.8222500681877136,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 527
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 186.10714721679688,
      "completions/mean_terminated_length": 159.07833862304688,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.18027399598822072,
      "grad_norm": 0.02056858502328396,
      "kl": 0.015903472900390625,
      "learning_rate": 5.214884871802703e-06,
      "loss": -0.0112,
      "num_tokens": 188390548.0,
      "reward": 0.42851802706718445,
      "reward_std": 0.06244615837931633,
      "rewards/gemini_judge_reward_func/mean": 0.1328125,
      "rewards/gemini_judge_reward_func/std": 0.24931590259075165,
      "rewards/semantic_correctness_reward_func/mean": 0.4469650685787201,
      "rewards/semantic_correctness_reward_func/std": 0.2338939607143402,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 528
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 825.0,
      "completions/mean_length": 164.9553680419922,
      "completions/mean_terminated_length": 149.33636474609375,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.1806154240109257,
      "grad_norm": 0.01978747919201851,
      "kl": 0.01563262939453125,
      "learning_rate": 5.198364305780922e-06,
      "loss": -0.0333,
      "num_tokens": 188762902.0,
      "reward": 0.4285474121570587,
      "reward_std": 0.06751622259616852,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.2482723742723465,
      "rewards/semantic_correctness_reward_func/mean": 0.4270046353340149,
      "rewards/semantic_correctness_reward_func/std": 0.20151624083518982,
      "rewards/xmlcount_reward_func/mean": 0.737330436706543,
      "rewards/xmlcount_reward_func/std": 0.4393596053123474,
      "step": 529
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 152.0625,
      "completions/mean_terminated_length": 148.15248107910156,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.18095685203363066,
      "grad_norm": 0.020898984745144844,
      "kl": 0.01631021499633789,
      "learning_rate": 5.1818415704788725e-06,
      "loss": -0.0143,
      "num_tokens": 189098264.0,
      "reward": 0.41772809624671936,
      "reward_std": 0.05771636217832565,
      "rewards/gemini_judge_reward_func/mean": 0.0982142835855484,
      "rewards/gemini_judge_reward_func/std": 0.19482412934303284,
      "rewards/semantic_correctness_reward_func/mean": 0.39071187376976013,
      "rewards/semantic_correctness_reward_func/std": 0.19836723804473877,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 530
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 402.0,
      "completions/mean_length": 148.0178680419922,
      "completions/mean_terminated_length": 136.126708984375,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.18129828005633564,
      "grad_norm": 0.02229609526693821,
      "kl": 0.016646385192871094,
      "learning_rate": 5.165316846586541e-06,
      "loss": -0.0295,
      "num_tokens": 189447896.0,
      "reward": 0.4175770878791809,
      "reward_std": 0.07173692435026169,
      "rewards/gemini_judge_reward_func/mean": 0.1261160671710968,
      "rewards/gemini_judge_reward_func/std": 0.244900643825531,
      "rewards/semantic_correctness_reward_func/mean": 0.4398496448993683,
      "rewards/semantic_correctness_reward_func/std": 0.2049618512392044,
      "rewards/xmlcount_reward_func/mean": 0.6979018449783325,
      "rewards/xmlcount_reward_func/std": 0.45866456627845764,
      "step": 531
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 167.4375,
      "completions/mean_terminated_length": 151.86363220214844,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.1816397080790406,
      "grad_norm": 0.022729597985744476,
      "kl": 0.016128063201904297,
      "learning_rate": 5.148790314815662e-06,
      "loss": -0.0083,
      "num_tokens": 189805810.0,
      "reward": 0.40617290139198303,
      "reward_std": 0.08018369972705841,
      "rewards/gemini_judge_reward_func/mean": 0.1462053507566452,
      "rewards/gemini_judge_reward_func/std": 0.2627832591533661,
      "rewards/semantic_correctness_reward_func/mean": 0.4157036244869232,
      "rewards/semantic_correctness_reward_func/std": 0.19158992171287537,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 532
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 468.0,
      "completions/mean_length": 150.22769165039062,
      "completions/mean_terminated_length": 142.35586547851562,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.18198113610174554,
      "grad_norm": 0.021641992032527924,
      "kl": 0.016852378845214844,
      "learning_rate": 5.132262155897739e-06,
      "loss": 0.0189,
      "num_tokens": 190160581.0,
      "reward": 0.4324653148651123,
      "reward_std": 0.07647126168012619,
      "rewards/gemini_judge_reward_func/mean": 0.1473214328289032,
      "rewards/gemini_judge_reward_func/std": 0.294861763715744,
      "rewards/semantic_correctness_reward_func/mean": 0.4376834034919739,
      "rewards/semantic_correctness_reward_func/std": 0.23118866980075836,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 533
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 589.0,
      "completions/mean_length": 152.13394165039062,
      "completions/mean_terminated_length": 144.27928161621094,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.18232256412445053,
      "grad_norm": 0.020904729142785072,
      "kl": 0.017549514770507812,
      "learning_rate": 5.11573255058207e-06,
      "loss": 0.0034,
      "num_tokens": 190511795.0,
      "reward": 0.4001460373401642,
      "reward_std": 0.05920318514108658,
      "rewards/gemini_judge_reward_func/mean": 0.0881696417927742,
      "rewards/gemini_judge_reward_func/std": 0.19316980242729187,
      "rewards/semantic_correctness_reward_func/mean": 0.394390732049942,
      "rewards/semantic_correctness_reward_func/std": 0.19004258513450623,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 534
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 722.0,
      "completions/mean_length": 157.79464721679688,
      "completions/mean_terminated_length": 153.9103240966797,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.18266399214715548,
      "grad_norm": 0.020164046436548233,
      "kl": 0.01853799819946289,
      "learning_rate": 5.099201679633769e-06,
      "loss": 0.0122,
      "num_tokens": 190839637.0,
      "reward": 0.4808293879032135,
      "reward_std": 0.07350355386734009,
      "rewards/gemini_judge_reward_func/mean": 0.1696428507566452,
      "rewards/gemini_judge_reward_func/std": 0.3008103370666504,
      "rewards/semantic_correctness_reward_func/mean": 0.4561111032962799,
      "rewards/semantic_correctness_reward_func/std": 0.250588595867157,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.39858436584472656,
      "step": 535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 652.0,
      "completions/mean_length": 162.5982208251953,
      "completions/mean_terminated_length": 138.88990783691406,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.18300542016986043,
      "grad_norm": 0.020656241104006767,
      "kl": 0.015102148056030273,
      "learning_rate": 5.082669723831793e-06,
      "loss": -0.0378,
      "num_tokens": 191234363.0,
      "reward": 0.3954419493675232,
      "reward_std": 0.052222900092601776,
      "rewards/gemini_judge_reward_func/mean": 0.1540178507566452,
      "rewards/gemini_judge_reward_func/std": 0.27251842617988586,
      "rewards/semantic_correctness_reward_func/mean": 0.4179239869117737,
      "rewards/semantic_correctness_reward_func/std": 0.2332126796245575,
      "rewards/xmlcount_reward_func/mean": 0.6256250739097595,
      "rewards/xmlcount_reward_func/std": 0.48569241166114807,
      "step": 536
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 534.0,
      "completions/max_terminated_length": 534.0,
      "completions/mean_length": 143.3482208251953,
      "completions/mean_terminated_length": 143.3482208251953,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.18334684819256541,
      "grad_norm": 0.02267182618379593,
      "kl": 0.017610549926757812,
      "learning_rate": 5.066136863966963e-06,
      "loss": -0.0024,
      "num_tokens": 191593785.0,
      "reward": 0.43893885612487793,
      "reward_std": 0.055210184305906296,
      "rewards/gemini_judge_reward_func/mean": 0.1484375,
      "rewards/gemini_judge_reward_func/std": 0.25169339776039124,
      "rewards/semantic_correctness_reward_func/mean": 0.4320690929889679,
      "rewards/semantic_correctness_reward_func/std": 0.2041468471288681,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 537
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 915.0,
      "completions/max_terminated_length": 915.0,
      "completions/mean_length": 145.40179443359375,
      "completions/mean_terminated_length": 145.40179443359375,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.18368827621527037,
      "grad_norm": 0.023711949586868286,
      "kl": 0.017864704132080078,
      "learning_rate": 5.049603280839982e-06,
      "loss": -0.0087,
      "num_tokens": 191961767.0,
      "reward": 0.4316798150539398,
      "reward_std": 0.07676589488983154,
      "rewards/gemini_judge_reward_func/mean": 0.1584821492433548,
      "rewards/gemini_judge_reward_func/std": 0.2957007586956024,
      "rewards/semantic_correctness_reward_func/mean": 0.4338454306125641,
      "rewards/semantic_correctness_reward_func/std": 0.2067124843597412,
      "rewards/xmlcount_reward_func/mean": 0.7037946581840515,
      "rewards/xmlcount_reward_func/std": 0.4546814262866974,
      "step": 538
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 784.0,
      "completions/mean_length": 166.58929443359375,
      "completions/mean_terminated_length": 147.01368713378906,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.18402970423797532,
      "grad_norm": 0.020993690937757492,
      "kl": 0.015955448150634766,
      "learning_rate": 5.033069155259471e-06,
      "loss": 0.0114,
      "num_tokens": 192308643.0,
      "reward": 0.44476279616355896,
      "reward_std": 0.08589890599250793,
      "rewards/gemini_judge_reward_func/mean": 0.1662946492433548,
      "rewards/gemini_judge_reward_func/std": 0.28207293152809143,
      "rewards/semantic_correctness_reward_func/mean": 0.4254744052886963,
      "rewards/semantic_correctness_reward_func/std": 0.21944975852966309,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 539
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 161.5982208251953,
      "completions/mean_terminated_length": 141.90867614746094,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.1843711322606803,
      "grad_norm": 0.021806977689266205,
      "kl": 0.018055438995361328,
      "learning_rate": 5.016534668039976e-06,
      "loss": 0.0092,
      "num_tokens": 192682705.0,
      "reward": 0.3981616199016571,
      "reward_std": 0.07369980216026306,
      "rewards/gemini_judge_reward_func/mean": 0.1361607164144516,
      "rewards/gemini_judge_reward_func/std": 0.24234557151794434,
      "rewards/semantic_correctness_reward_func/mean": 0.4368884265422821,
      "rewards/semantic_correctness_reward_func/std": 0.20789456367492676,
      "rewards/xmlcount_reward_func/mean": 0.6407991051673889,
      "rewards/xmlcount_reward_func/std": 0.47512152791023254,
      "step": 540
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 861.0,
      "completions/mean_length": 161.8482208251953,
      "completions/mean_terminated_length": 142.1643829345703,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.18471256028338526,
      "grad_norm": 0.02082211524248123,
      "kl": 0.016353130340576172,
      "learning_rate": 5e-06,
      "loss": 0.0069,
      "num_tokens": 193064427.0,
      "reward": 0.377638041973114,
      "reward_std": 0.048613447695970535,
      "rewards/gemini_judge_reward_func/mean": 0.1294642835855484,
      "rewards/gemini_judge_reward_func/std": 0.24939870834350586,
      "rewards/semantic_correctness_reward_func/mean": 0.434475839138031,
      "rewards/semantic_correctness_reward_func/std": 0.22424183785915375,
      "rewards/xmlcount_reward_func/mean": 0.5973929166793823,
      "rewards/xmlcount_reward_func/std": 0.48817571997642517,
      "step": 541
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 766.0,
      "completions/mean_length": 151.7544708251953,
      "completions/mean_terminated_length": 147.8430633544922,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.1850539883060902,
      "grad_norm": 0.023179858922958374,
      "kl": 0.018024682998657227,
      "learning_rate": 4.983465331960025e-06,
      "loss": -0.034,
      "num_tokens": 193415356.0,
      "reward": 0.4562413692474365,
      "reward_std": 0.06751704216003418,
      "rewards/gemini_judge_reward_func/mean": 0.1294642835855484,
      "rewards/gemini_judge_reward_func/std": 0.24140411615371704,
      "rewards/semantic_correctness_reward_func/mean": 0.4492780268192291,
      "rewards/semantic_correctness_reward_func/std": 0.21333856880664825,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 542
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 295.0,
      "completions/mean_length": 166.71429443359375,
      "completions/mean_terminated_length": 147.14154052734375,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.1853954163287952,
      "grad_norm": 0.019812671467661858,
      "kl": 0.015459537506103516,
      "learning_rate": 4.96693084474053e-06,
      "loss": 0.0041,
      "num_tokens": 193767172.0,
      "reward": 0.4424658417701721,
      "reward_std": 0.07584776729345322,
      "rewards/gemini_judge_reward_func/mean": 0.1361607164144516,
      "rewards/gemini_judge_reward_func/std": 0.2457905411720276,
      "rewards/semantic_correctness_reward_func/mean": 0.4385075569152832,
      "rewards/semantic_correctness_reward_func/std": 0.21686992049217224,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 166.4553680419922,
      "completions/mean_terminated_length": 154.81448364257812,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.18573684435150015,
      "grad_norm": 0.020016420632600784,
      "kl": 0.011565446853637695,
      "learning_rate": 4.950396719160019e-06,
      "loss": -0.0232,
      "num_tokens": 194111730.0,
      "reward": 0.4828590750694275,
      "reward_std": 0.06606737524271011,
      "rewards/gemini_judge_reward_func/mean": 0.1428571492433548,
      "rewards/gemini_judge_reward_func/std": 0.27038896083831787,
      "rewards/semantic_correctness_reward_func/mean": 0.43940237164497375,
      "rewards/semantic_correctness_reward_func/std": 0.20458753407001495,
      "rewards/xmlcount_reward_func/mean": 0.844589352607727,
      "rewards/xmlcount_reward_func/std": 0.3580341935157776,
      "step": 544
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 322.0,
      "completions/mean_length": 153.55357360839844,
      "completions/mean_terminated_length": 137.72726440429688,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.1860782723742051,
      "grad_norm": 0.02174554578959942,
      "kl": 0.018509387969970703,
      "learning_rate": 4.93386313603304e-06,
      "loss": -0.0003,
      "num_tokens": 194440186.0,
      "reward": 0.47312822937965393,
      "reward_std": 0.06163394823670387,
      "rewards/gemini_judge_reward_func/mean": 0.1473214328289032,
      "rewards/gemini_judge_reward_func/std": 0.23450541496276855,
      "rewards/semantic_correctness_reward_func/mean": 0.4622481167316437,
      "rewards/semantic_correctness_reward_func/std": 0.23137818276882172,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 306.0,
      "completions/mean_length": 155.45089721679688,
      "completions/mean_terminated_length": 135.62100219726562,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.18641970039691008,
      "grad_norm": 0.020509352907538414,
      "kl": 0.016957759857177734,
      "learning_rate": 4.917330276168208e-06,
      "loss": 0.0021,
      "num_tokens": 194803295.0,
      "reward": 0.4079745411872864,
      "reward_std": 0.06811019033193588,
      "rewards/gemini_judge_reward_func/mean": 0.0993303582072258,
      "rewards/gemini_judge_reward_func/std": 0.2218693345785141,
      "rewards/semantic_correctness_reward_func/mean": 0.44696179032325745,
      "rewards/semantic_correctness_reward_func/std": 0.19916366040706635,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 546
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 394.0,
      "completions/mean_length": 157.58482360839844,
      "completions/mean_terminated_length": 141.83181762695312,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.18676112841961504,
      "grad_norm": 0.022121498361229897,
      "kl": 0.02248358726501465,
      "learning_rate": 4.900798320366233e-06,
      "loss": -0.0047,
      "num_tokens": 195182922.0,
      "reward": 0.4160357713699341,
      "reward_std": 0.05240814387798309,
      "rewards/gemini_judge_reward_func/mean": 0.0926339253783226,
      "rewards/gemini_judge_reward_func/std": 0.2235099822282791,
      "rewards/semantic_correctness_reward_func/mean": 0.42916086316108704,
      "rewards/semantic_correctness_reward_func/std": 0.188674196600914,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 547
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 755.0,
      "completions/mean_length": 156.20982360839844,
      "completions/mean_terminated_length": 148.3918914794922,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.18710255644232002,
      "grad_norm": 0.02068450301885605,
      "kl": 0.018212318420410156,
      "learning_rate": 4.884267449417932e-06,
      "loss": 0.0021,
      "num_tokens": 195525961.0,
      "reward": 0.4415396451950073,
      "reward_std": 0.06090681999921799,
      "rewards/gemini_judge_reward_func/mean": 0.1339285671710968,
      "rewards/gemini_judge_reward_func/std": 0.2342064529657364,
      "rewards/semantic_correctness_reward_func/mean": 0.4383408725261688,
      "rewards/semantic_correctness_reward_func/std": 0.1995237171649933,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 548
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 629.0,
      "completions/mean_length": 149.54464721679688,
      "completions/mean_terminated_length": 145.62332153320312,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.18744398446502497,
      "grad_norm": 0.020279210060834885,
      "kl": 0.014973640441894531,
      "learning_rate": 4.867737844102261e-06,
      "loss": -0.0146,
      "num_tokens": 195887403.0,
      "reward": 0.44515714049339294,
      "reward_std": 0.05280788615345955,
      "rewards/gemini_judge_reward_func/mean": 0.1127232164144516,
      "rewards/gemini_judge_reward_func/std": 0.22175651788711548,
      "rewards/semantic_correctness_reward_func/mean": 0.4273391664028168,
      "rewards/semantic_correctness_reward_func/std": 0.21332186460494995,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 549
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 889.0,
      "completions/mean_length": 166.63839721679688,
      "completions/mean_terminated_length": 155.0,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.18778541248772992,
      "grad_norm": 0.019517678767442703,
      "kl": 0.012271404266357422,
      "learning_rate": 4.851209685184339e-06,
      "loss": 0.0063,
      "num_tokens": 196253170.0,
      "reward": 0.4689396023750305,
      "reward_std": 0.08325158804655075,
      "rewards/gemini_judge_reward_func/mean": 0.1919642835855484,
      "rewards/gemini_judge_reward_func/std": 0.29944294691085815,
      "rewards/semantic_correctness_reward_func/mean": 0.4682067334651947,
      "rewards/semantic_correctness_reward_func/std": 0.24702706933021545,
      "rewards/xmlcount_reward_func/mean": 0.7462812662124634,
      "rewards/xmlcount_reward_func/std": 0.4369716942310333,
      "step": 550
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 356.0,
      "completions/mean_length": 164.35269165039062,
      "completions/mean_terminated_length": 144.7260284423828,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.1881268405104349,
      "grad_norm": 0.021506622433662415,
      "kl": 0.014761686325073242,
      "learning_rate": 4.8346831534134595e-06,
      "loss": -0.0001,
      "num_tokens": 196590861.0,
      "reward": 0.4275391399860382,
      "reward_std": 0.07103725522756577,
      "rewards/gemini_judge_reward_func/mean": 0.1361607164144516,
      "rewards/gemini_judge_reward_func/std": 0.25474947690963745,
      "rewards/semantic_correctness_reward_func/mean": 0.3996242582798004,
      "rewards/semantic_correctness_reward_func/std": 0.20673991739749908,
      "rewards/xmlcount_reward_func/mean": 0.7328750491142273,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 551
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 943.0,
      "completions/mean_length": 156.5178680419922,
      "completions/mean_terminated_length": 144.74208068847656,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.18846826853313986,
      "grad_norm": 0.022670861333608627,
      "kl": 0.017201900482177734,
      "learning_rate": 4.818158429521129e-06,
      "loss": -0.0106,
      "num_tokens": 196951165.0,
      "reward": 0.4059324860572815,
      "reward_std": 0.05520794540643692,
      "rewards/gemini_judge_reward_func/mean": 0.0959821417927742,
      "rewards/gemini_judge_reward_func/std": 0.18865090608596802,
      "rewards/semantic_correctness_reward_func/mean": 0.45956405997276306,
      "rewards/semantic_correctness_reward_func/std": 0.21358434855937958,
      "rewards/xmlcount_reward_func/mean": 0.689067006111145,
      "rewards/xmlcount_reward_func/std": 0.4582628309726715,
      "step": 552
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 758.0,
      "completions/mean_length": 158.15625,
      "completions/mean_terminated_length": 142.41363525390625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.1888096965558448,
      "grad_norm": 0.02014843560755253,
      "kl": 0.015711545944213867,
      "learning_rate": 4.801635694219079e-06,
      "loss": -0.0282,
      "num_tokens": 197304152.0,
      "reward": 0.4128032624721527,
      "reward_std": 0.06284855306148529,
      "rewards/gemini_judge_reward_func/mean": 0.109375,
      "rewards/gemini_judge_reward_func/std": 0.22530809044837952,
      "rewards/semantic_correctness_reward_func/mean": 0.4152660667896271,
      "rewards/semantic_correctness_reward_func/std": 0.2113528996706009,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 553
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 360.0,
      "completions/mean_length": 150.3928680419922,
      "completions/mean_terminated_length": 134.50909423828125,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.1891511245785498,
      "grad_norm": 0.024563709273934364,
      "kl": 0.020460128784179688,
      "learning_rate": 4.785115128197298e-06,
      "loss": -0.0112,
      "num_tokens": 197680252.0,
      "reward": 0.4380243420600891,
      "reward_std": 0.09170064330101013,
      "rewards/gemini_judge_reward_func/mean": 0.1640625,
      "rewards/gemini_judge_reward_func/std": 0.27535709738731384,
      "rewards/semantic_correctness_reward_func/mean": 0.443183958530426,
      "rewards/semantic_correctness_reward_func/std": 0.2359272539615631,
      "rewards/xmlcount_reward_func/mean": 0.709406316280365,
      "rewards/xmlcount_reward_func/std": 0.45243263244628906,
      "step": 554
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 449.0,
      "completions/mean_length": 161.38839721679688,
      "completions/mean_terminated_length": 149.67874145507812,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.18949255260125475,
      "grad_norm": 0.019087178632616997,
      "kl": 0.014681816101074219,
      "learning_rate": 4.768596912122046e-06,
      "loss": -0.031,
      "num_tokens": 198046059.0,
      "reward": 0.43682870268821716,
      "reward_std": 0.052002809941768646,
      "rewards/gemini_judge_reward_func/mean": 0.0770089253783226,
      "rewards/gemini_judge_reward_func/std": 0.17215459048748016,
      "rewards/semantic_correctness_reward_func/mean": 0.4213755428791046,
      "rewards/semantic_correctness_reward_func/std": 0.18203482031822205,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 555
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.044642857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 698.0,
      "completions/mean_length": 184.30357360839844,
      "completions/mean_terminated_length": 145.06541442871094,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.1898339806239597,
      "grad_norm": 0.019434470683336258,
      "kl": 0.013373613357543945,
      "learning_rate": 4.752081226633888e-06,
      "loss": -0.0156,
      "num_tokens": 198423511.0,
      "reward": 0.3853397071361542,
      "reward_std": 0.05791494622826576,
      "rewards/gemini_judge_reward_func/mean": 0.1450892835855484,
      "rewards/gemini_judge_reward_func/std": 0.2873225510120392,
      "rewards/semantic_correctness_reward_func/mean": 0.45006465911865234,
      "rewards/semantic_correctness_reward_func/std": 0.21861128509044647,
      "rewards/xmlcount_reward_func/mean": 0.5932276844978333,
      "rewards/xmlcount_reward_func/std": 0.49093925952911377,
      "step": 556
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 394.0,
      "completions/mean_length": 165.5,
      "completions/mean_terminated_length": 141.87155151367188,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.19017540864666468,
      "grad_norm": 0.01992631144821644,
      "kl": 0.01584911346435547,
      "learning_rate": 4.735568252345718e-06,
      "loss": -0.019,
      "num_tokens": 198796363.0,
      "reward": 0.3802779018878937,
      "reward_std": 0.050197649747133255,
      "rewards/gemini_judge_reward_func/mean": 0.1194196417927742,
      "rewards/gemini_judge_reward_func/std": 0.23904728889465332,
      "rewards/semantic_correctness_reward_func/mean": 0.43720176815986633,
      "rewards/semantic_correctness_reward_func/std": 0.17973925173282623,
      "rewards/xmlcount_reward_func/mean": 0.6126741766929626,
      "rewards/xmlcount_reward_func/std": 0.48539677262306213,
      "step": 557
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 157.01339721679688,
      "completions/mean_terminated_length": 145.24435424804688,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.19051683666936964,
      "grad_norm": 0.021687161177396774,
      "kl": 0.020299911499023438,
      "learning_rate": 4.719058169840773e-06,
      "loss": 0.015,
      "num_tokens": 199166094.0,
      "reward": 0.42609551548957825,
      "reward_std": 0.06808494031429291,
      "rewards/gemini_judge_reward_func/mean": 0.1194196417927742,
      "rewards/gemini_judge_reward_func/std": 0.2378719598054886,
      "rewards/semantic_correctness_reward_func/mean": 0.4080309271812439,
      "rewards/semantic_correctness_reward_func/std": 0.2078658640384674,
      "rewards/xmlcount_reward_func/mean": 0.7418035864830017,
      "rewards/xmlcount_reward_func/std": 0.43432918190956116,
      "step": 558
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 848.0,
      "completions/max_terminated_length": 848.0,
      "completions/mean_length": 146.4241180419922,
      "completions/mean_terminated_length": 146.4241180419922,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.1908582646920746,
      "grad_norm": 0.020877644419670105,
      "kl": 0.01645803451538086,
      "learning_rate": 4.702551159670672e-06,
      "loss": -0.0548,
      "num_tokens": 199510785.0,
      "reward": 0.45165494084358215,
      "reward_std": 0.07315388321876526,
      "rewards/gemini_judge_reward_func/mean": 0.1439732164144516,
      "rewards/gemini_judge_reward_func/std": 0.24185720086097717,
      "rewards/semantic_correctness_reward_func/mean": 0.433077871799469,
      "rewards/semantic_correctness_reward_func/std": 0.2338293194770813,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 559
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 304.0,
      "completions/max_terminated_length": 304.0,
      "completions/mean_length": 141.375,
      "completions/mean_terminated_length": 141.375,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.19119969271477957,
      "grad_norm": 0.02385268360376358,
      "kl": 0.014994382858276367,
      "learning_rate": 4.686047402353433e-06,
      "loss": 0.0111,
      "num_tokens": 199862417.0,
      "reward": 0.47651809453964233,
      "reward_std": 0.06831540167331696,
      "rewards/gemini_judge_reward_func/mean": 0.1618303507566452,
      "rewards/gemini_judge_reward_func/std": 0.29813432693481445,
      "rewards/semantic_correctness_reward_func/mean": 0.4501795768737793,
      "rewards/semantic_correctness_reward_func/std": 0.22495129704475403,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 560
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 675.0,
      "completions/mean_length": 156.93304443359375,
      "completions/mean_terminated_length": 149.1216278076172,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.19154112073748453,
      "grad_norm": 0.022106066346168518,
      "kl": 0.015239715576171875,
      "learning_rate": 4.669547078371503e-06,
      "loss": -0.0075,
      "num_tokens": 200199170.0,
      "reward": 0.4726813733577728,
      "reward_std": 0.06441599130630493,
      "rewards/gemini_judge_reward_func/mean": 0.1674107164144516,
      "rewards/gemini_judge_reward_func/std": 0.26019221544265747,
      "rewards/semantic_correctness_reward_func/mean": 0.4555852711200714,
      "rewards/semantic_correctness_reward_func/std": 0.21719810366630554,
      "rewards/xmlcount_reward_func/mean": 0.786500096321106,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 561
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 322.0,
      "completions/mean_length": 169.24107360839844,
      "completions/mean_terminated_length": 149.7260284423828,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.19188254876018948,
      "grad_norm": 0.019646869972348213,
      "kl": 0.012475013732910156,
      "learning_rate": 4.65305036816978e-06,
      "loss": -0.0089,
      "num_tokens": 200528212.0,
      "reward": 0.4791014790534973,
      "reward_std": 0.06499748677015305,
      "rewards/gemini_judge_reward_func/mean": 0.1417410671710968,
      "rewards/gemini_judge_reward_func/std": 0.24661129713058472,
      "rewards/semantic_correctness_reward_func/mean": 0.4317750036716461,
      "rewards/semantic_correctness_reward_func/std": 0.2213517427444458,
      "rewards/xmlcount_reward_func/mean": 0.8401250839233398,
      "rewards/xmlcount_reward_func/std": 0.3684578835964203,
      "step": 562
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 440.0,
      "completions/mean_length": 168.69644165039062,
      "completions/mean_terminated_length": 141.10598754882812,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.19222397678289446,
      "grad_norm": 0.020795688033103943,
      "kl": 0.012841224670410156,
      "learning_rate": 4.636557452153645e-06,
      "loss": -0.0124,
      "num_tokens": 200886308.0,
      "reward": 0.4488481879234314,
      "reward_std": 0.05293935909867287,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.24715863168239594,
      "rewards/semantic_correctness_reward_func/mean": 0.4502943158149719,
      "rewards/semantic_correctness_reward_func/std": 0.23604029417037964,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 563
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 281.0,
      "completions/mean_length": 151.6116180419922,
      "completions/mean_terminated_length": 143.75225830078125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.19256540480559942,
      "grad_norm": 0.020217539742588997,
      "kl": 0.01593160629272461,
      "learning_rate": 4.620068510686985e-06,
      "loss": -0.0059,
      "num_tokens": 201230369.0,
      "reward": 0.453321635723114,
      "reward_std": 0.06796573847532272,
      "rewards/gemini_judge_reward_func/mean": 0.1372767835855484,
      "rewards/gemini_judge_reward_func/std": 0.24459391832351685,
      "rewards/semantic_correctness_reward_func/mean": 0.4190545082092285,
      "rewards/semantic_correctness_reward_func/std": 0.19237849116325378,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 656.0,
      "completions/mean_length": 170.2678680419922,
      "completions/mean_terminated_length": 142.7281036376953,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.19290683282830437,
      "grad_norm": 0.0209835022687912,
      "kl": 0.01555776596069336,
      "learning_rate": 4.60358372409022e-06,
      "loss": -0.0456,
      "num_tokens": 201595869.0,
      "reward": 0.39094409346580505,
      "reward_std": 0.0595240518450737,
      "rewards/gemini_judge_reward_func/mean": 0.1037946417927742,
      "rewards/gemini_judge_reward_func/std": 0.22234255075454712,
      "rewards/semantic_correctness_reward_func/mean": 0.42662209272384644,
      "rewards/semantic_correctness_reward_func/std": 0.2272646129131317,
      "rewards/xmlcount_reward_func/mean": 0.6602544784545898,
      "rewards/xmlcount_reward_func/std": 0.4744928777217865,
      "step": 565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.044642857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 985.0,
      "completions/mean_length": 194.9419708251953,
      "completions/mean_terminated_length": 156.200927734375,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.19324826085100935,
      "grad_norm": 0.021231811493635178,
      "kl": 0.014326095581054688,
      "learning_rate": 4.587103272638339e-06,
      "loss": -0.0311,
      "num_tokens": 201987592.0,
      "reward": 0.4055339992046356,
      "reward_std": 0.05433611199259758,
      "rewards/gemini_judge_reward_func/mean": 0.1127232164144516,
      "rewards/gemini_judge_reward_func/std": 0.2480076402425766,
      "rewards/semantic_correctness_reward_func/mean": 0.44372352957725525,
      "rewards/semantic_correctness_reward_func/std": 0.21770647168159485,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 299.0,
      "completions/max_terminated_length": 299.0,
      "completions/mean_length": 145.13394165039062,
      "completions/mean_terminated_length": 145.13394165039062,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.1935896888737143,
      "grad_norm": 0.02045750990509987,
      "kl": 0.013641357421875,
      "learning_rate": 4.570627336558915e-06,
      "loss": 0.0053,
      "num_tokens": 202330202.0,
      "reward": 0.45321542024612427,
      "reward_std": 0.056673984974622726,
      "rewards/gemini_judge_reward_func/mean": 0.1328125,
      "rewards/gemini_judge_reward_func/std": 0.23184122145175934,
      "rewards/semantic_correctness_reward_func/mean": 0.4274519979953766,
      "rewards/semantic_correctness_reward_func/std": 0.22571316361427307,
      "rewards/xmlcount_reward_func/mean": 0.786500096321106,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 567
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 343.0,
      "completions/mean_length": 154.75,
      "completions/mean_terminated_length": 146.91893005371094,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.1939311168964193,
      "grad_norm": 0.022775255143642426,
      "kl": 0.01674032211303711,
      "learning_rate": 4.554156096030149e-06,
      "loss": -0.0144,
      "num_tokens": 202703858.0,
      "reward": 0.43068042397499084,
      "reward_std": 0.055178917944431305,
      "rewards/gemini_judge_reward_func/mean": 0.1261160671710968,
      "rewards/gemini_judge_reward_func/std": 0.25167351961135864,
      "rewards/semantic_correctness_reward_func/mean": 0.4555181860923767,
      "rewards/semantic_correctness_reward_func/std": 0.2062966674566269,
      "rewards/xmlcount_reward_func/mean": 0.7228259444236755,
      "rewards/xmlcount_reward_func/std": 0.4447035491466522,
      "step": 568
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 398.0,
      "completions/mean_length": 152.7544708251953,
      "completions/mean_terminated_length": 144.90541076660156,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.19427254491912424,
      "grad_norm": 0.02390287257730961,
      "kl": 0.0160367488861084,
      "learning_rate": 4.537689731178883e-06,
      "loss": -0.0266,
      "num_tokens": 203091631.0,
      "reward": 0.4275432527065277,
      "reward_std": 0.07999604940414429,
      "rewards/gemini_judge_reward_func/mean": 0.1584821492433548,
      "rewards/gemini_judge_reward_func/std": 0.2709713876247406,
      "rewards/semantic_correctness_reward_func/mean": 0.3907518982887268,
      "rewards/semantic_correctness_reward_func/std": 0.20916995406150818,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 569
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 326.0,
      "completions/mean_length": 154.55357360839844,
      "completions/mean_terminated_length": 142.7511444091797,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.1946139729418292,
      "grad_norm": 0.022180769592523575,
      "kl": 0.014091014862060547,
      "learning_rate": 4.5212284220786495e-06,
      "loss": -0.034,
      "num_tokens": 203441075.0,
      "reward": 0.4240075349807739,
      "reward_std": 0.05866669490933418,
      "rewards/gemini_judge_reward_func/mean": 0.0959821417927742,
      "rewards/gemini_judge_reward_func/std": 0.19450274109840393,
      "rewards/semantic_correctness_reward_func/mean": 0.4265732169151306,
      "rewards/semantic_correctness_reward_func/std": 0.18987764418125153,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 570
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 283.0,
      "completions/max_terminated_length": 283.0,
      "completions/mean_length": 139.49107360839844,
      "completions/mean_terminated_length": 139.49107360839844,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.19495540096453418,
      "grad_norm": 0.0199284665286541,
      "kl": 0.013919830322265625,
      "learning_rate": 4.504772348747687e-06,
      "loss": -0.0205,
      "num_tokens": 203810761.0,
      "reward": 0.4262349307537079,
      "reward_std": 0.07754707336425781,
      "rewards/gemini_judge_reward_func/mean": 0.1495535671710968,
      "rewards/gemini_judge_reward_func/std": 0.2515864968299866,
      "rewards/semantic_correctness_reward_func/mean": 0.45570138096809387,
      "rewards/semantic_correctness_reward_func/std": 0.2322954386472702,
      "rewards/xmlcount_reward_func/mean": 0.6881831288337708,
      "rewards/xmlcount_reward_func/std": 0.4650120735168457,
      "step": 571
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 611.0,
      "completions/mean_length": 160.74107360839844,
      "completions/mean_terminated_length": 145.0454559326172,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.19529682898723913,
      "grad_norm": 0.020155394449830055,
      "kl": 0.016458988189697266,
      "learning_rate": 4.488321691146975e-06,
      "loss": 0.0005,
      "num_tokens": 204169107.0,
      "reward": 0.4155966341495514,
      "reward_std": 0.05478544905781746,
      "rewards/gemini_judge_reward_func/mean": 0.1194196417927742,
      "rewards/gemini_judge_reward_func/std": 0.22453764081001282,
      "rewards/semantic_correctness_reward_func/mean": 0.40914371609687805,
      "rewards/semantic_correctness_reward_func/std": 0.1803514063358307,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 572
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 267.0,
      "completions/mean_length": 145.82144165039062,
      "completions/mean_terminated_length": 137.909912109375,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.19563825700994408,
      "grad_norm": 0.021136289462447166,
      "kl": 0.021631717681884766,
      "learning_rate": 4.471876629178273e-06,
      "loss": -0.0114,
      "num_tokens": 204491755.0,
      "reward": 0.451036274433136,
      "reward_std": 0.06088856980204582,
      "rewards/gemini_judge_reward_func/mean": 0.1372767835855484,
      "rewards/gemini_judge_reward_func/std": 0.24459391832351685,
      "rewards/semantic_correctness_reward_func/mean": 0.43219009041786194,
      "rewards/semantic_correctness_reward_func/std": 0.2034158557653427,
      "rewards/xmlcount_reward_func/mean": 0.7742188572883606,
      "rewards/xmlcount_reward_func/std": 0.4162629544734955,
      "step": 573
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 370.0,
      "completions/mean_length": 163.20982360839844,
      "completions/mean_terminated_length": 147.55908203125,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.19597968503264906,
      "grad_norm": 0.02188599295914173,
      "kl": 0.01761460304260254,
      "learning_rate": 4.4554373426821375e-06,
      "loss": 0.0214,
      "num_tokens": 204838910.0,
      "reward": 0.4334542155265808,
      "reward_std": 0.06716626137495041,
      "rewards/gemini_judge_reward_func/mean": 0.1305803507566452,
      "rewards/gemini_judge_reward_func/std": 0.2207612693309784,
      "rewards/semantic_correctness_reward_func/mean": 0.47611019015312195,
      "rewards/semantic_correctness_reward_func/std": 0.19524888694286346,
      "rewards/xmlcount_reward_func/mean": 0.7150000929832458,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 670.0,
      "completions/mean_length": 152.5178680419922,
      "completions/mean_terminated_length": 148.60987854003906,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.19632111305535402,
      "grad_norm": 0.027467962354421616,
      "kl": 0.02365851402282715,
      "learning_rate": 4.439004011435979e-06,
      "loss": 0.0124,
      "num_tokens": 205200686.0,
      "reward": 0.4516783654689789,
      "reward_std": 0.050351765006780624,
      "rewards/gemini_judge_reward_func/mean": 0.1651785671710968,
      "rewards/gemini_judge_reward_func/std": 0.3023702800273895,
      "rewards/semantic_correctness_reward_func/mean": 0.42674875259399414,
      "rewards/semantic_correctness_reward_func/std": 0.2154146283864975,
      "rewards/xmlcount_reward_func/mean": 0.7506428956985474,
      "rewards/xmlcount_reward_func/std": 0.4343574345111847,
      "step": 575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 570.0,
      "completions/mean_length": 150.6607208251953,
      "completions/mean_terminated_length": 142.7928009033203,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.19666254107805897,
      "grad_norm": 0.021642692387104034,
      "kl": 0.014559745788574219,
      "learning_rate": 4.42257681515207e-06,
      "loss": 0.0003,
      "num_tokens": 205540602.0,
      "reward": 0.4592445492744446,
      "reward_std": 0.07840275019407272,
      "rewards/gemini_judge_reward_func/mean": 0.1305803507566452,
      "rewards/gemini_judge_reward_func/std": 0.24021686613559723,
      "rewards/semantic_correctness_reward_func/mean": 0.4620618224143982,
      "rewards/semantic_correctness_reward_func/std": 0.2147509902715683,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 576
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 365.0,
      "completions/mean_length": 144.47769165039062,
      "completions/mean_terminated_length": 136.55406188964844,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.19700396910076395,
      "grad_norm": 0.02344970405101776,
      "kl": 0.02066946029663086,
      "learning_rate": 4.406155933475599e-06,
      "loss": -0.0091,
      "num_tokens": 205913557.0,
      "reward": 0.4101777970790863,
      "reward_std": 0.07215236127376556,
      "rewards/gemini_judge_reward_func/mean": 0.1417410671710968,
      "rewards/gemini_judge_reward_func/std": 0.24887388944625854,
      "rewards/semantic_correctness_reward_func/mean": 0.4446566700935364,
      "rewards/semantic_correctness_reward_func/std": 0.19075852632522583,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 577
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1005.0,
      "completions/mean_length": 166.50894165039062,
      "completions/mean_terminated_length": 146.93150329589844,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.1973453971234689,
      "grad_norm": 0.02164464257657528,
      "kl": 0.022466421127319336,
      "learning_rate": 4.3897415459827e-06,
      "loss": -0.0295,
      "num_tokens": 206277067.0,
      "reward": 0.40516912937164307,
      "reward_std": 0.06767025589942932,
      "rewards/gemini_judge_reward_func/mean": 0.1037946417927742,
      "rewards/gemini_judge_reward_func/std": 0.22359952330589294,
      "rewards/semantic_correctness_reward_func/mean": 0.45975613594055176,
      "rewards/semantic_correctness_reward_func/std": 0.20172879099845886,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 578
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 165.875,
      "completions/mean_terminated_length": 146.28309631347656,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.19768682514617386,
      "grad_norm": 0.02861608937382698,
      "kl": 0.021225690841674805,
      "learning_rate": 4.373333832178478e-06,
      "loss": -0.035,
      "num_tokens": 206632675.0,
      "reward": 0.39243432879447937,
      "reward_std": 0.04971366375684738,
      "rewards/gemini_judge_reward_func/mean": 0.0904017835855484,
      "rewards/gemini_judge_reward_func/std": 0.20755748450756073,
      "rewards/semantic_correctness_reward_func/mean": 0.422868013381958,
      "rewards/semantic_correctness_reward_func/std": 0.21492131054401398,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 579
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 159.9107208251953,
      "completions/mean_terminated_length": 152.12612915039062,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.19802825316887884,
      "grad_norm": 0.023097701370716095,
      "kl": 0.014588117599487305,
      "learning_rate": 4.356932971495071e-06,
      "loss": -0.0088,
      "num_tokens": 207001439.0,
      "reward": 0.4645816385746002,
      "reward_std": 0.06817057728767395,
      "rewards/gemini_judge_reward_func/mean": 0.1339285671710968,
      "rewards/gemini_judge_reward_func/std": 0.24588216841220856,
      "rewards/semantic_correctness_reward_func/mean": 0.4820510447025299,
      "rewards/semantic_correctness_reward_func/std": 0.20867471396923065,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 580
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 834.0,
      "completions/mean_length": 160.1428680419922,
      "completions/mean_terminated_length": 148.41629028320312,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.1983696811915838,
      "grad_norm": 0.02026437781751156,
      "kl": 0.019693374633789062,
      "learning_rate": 4.340539143289655e-06,
      "loss": -0.0289,
      "num_tokens": 207373963.0,
      "reward": 0.4335937798023224,
      "reward_std": 0.06907905638217926,
      "rewards/gemini_judge_reward_func/mean": 0.1428571492433548,
      "rewards/gemini_judge_reward_func/std": 0.24879594147205353,
      "rewards/semantic_correctness_reward_func/mean": 0.4522543251514435,
      "rewards/semantic_correctness_reward_func/std": 0.19972975552082062,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 581
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 795.0,
      "completions/mean_length": 157.0178680419922,
      "completions/mean_terminated_length": 149.20721435546875,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.19871110921428875,
      "grad_norm": 0.020305583253502846,
      "kl": 0.014866828918457031,
      "learning_rate": 4.324152526842517e-06,
      "loss": -0.0239,
      "num_tokens": 207735511.0,
      "reward": 0.5174515247344971,
      "reward_std": 0.08982065320014954,
      "rewards/gemini_judge_reward_func/mean": 0.1941964328289032,
      "rewards/gemini_judge_reward_func/std": 0.2894052565097809,
      "rewards/semantic_correctness_reward_func/mean": 0.5186145901679993,
      "rewards/semantic_correctness_reward_func/std": 0.22194412350654602,
      "rewards/xmlcount_reward_func/mean": 0.8401250839233398,
      "rewards/xmlcount_reward_func/std": 0.3684578835964203,
      "step": 582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 289.0,
      "completions/mean_length": 162.7678680419922,
      "completions/mean_terminated_length": 147.1090850830078,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.19905253723699373,
      "grad_norm": 0.020116182044148445,
      "kl": 0.017856121063232422,
      "learning_rate": 4.307773301355063e-06,
      "loss": -0.0238,
      "num_tokens": 208069891.0,
      "reward": 0.45708727836608887,
      "reward_std": 0.061787575483322144,
      "rewards/gemini_judge_reward_func/mean": 0.1328125,
      "rewards/gemini_judge_reward_func/std": 0.25487470626831055,
      "rewards/semantic_correctness_reward_func/mean": 0.4468111991882324,
      "rewards/semantic_correctness_reward_func/std": 0.21997879445552826,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 583
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 328.0,
      "completions/mean_length": 148.85714721679688,
      "completions/mean_terminated_length": 144.9327392578125,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.19939396525969869,
      "grad_norm": 0.020018786191940308,
      "kl": 0.01651740074157715,
      "learning_rate": 4.291401645947879e-06,
      "loss": 0.0001,
      "num_tokens": 208416219.0,
      "reward": 0.43855661153793335,
      "reward_std": 0.05887799337506294,
      "rewards/gemini_judge_reward_func/mean": 0.125,
      "rewards/gemini_judge_reward_func/std": 0.23437733948230743,
      "rewards/semantic_correctness_reward_func/mean": 0.4412831664085388,
      "rewards/semantic_correctness_reward_func/std": 0.20886096358299255,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 683.0,
      "completions/mean_length": 158.47769165039062,
      "completions/mean_terminated_length": 142.74090576171875,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.19973539328240367,
      "grad_norm": 0.01968037523329258,
      "kl": 0.013400793075561523,
      "learning_rate": 4.275037739658771e-06,
      "loss": -0.0413,
      "num_tokens": 208759250.0,
      "reward": 0.41233882308006287,
      "reward_std": 0.06944146752357483,
      "rewards/gemini_judge_reward_func/mean": 0.109375,
      "rewards/gemini_judge_reward_func/std": 0.22654861211776733,
      "rewards/semantic_correctness_reward_func/mean": 0.41294386982917786,
      "rewards/semantic_correctness_reward_func/std": 0.21011611819267273,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 787.0,
      "completions/mean_length": 163.94644165039062,
      "completions/mean_terminated_length": 148.30908203125,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.20007682130510862,
      "grad_norm": 0.020078567788004875,
      "kl": 0.016144275665283203,
      "learning_rate": 4.25868176144079e-06,
      "loss": -0.0363,
      "num_tokens": 209127002.0,
      "reward": 0.3943597078323364,
      "reward_std": 0.06699073314666748,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.23073764145374298,
      "rewards/semantic_correctness_reward_func/mean": 0.4281020760536194,
      "rewards/semantic_correctness_reward_func/std": 0.186894953250885,
      "rewards/xmlcount_reward_func/mean": 0.643500030040741,
      "rewards/xmlcount_reward_func/std": 0.48071083426475525,
      "step": 586
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 367.0,
      "completions/mean_length": 158.75,
      "completions/mean_terminated_length": 147.00453186035156,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.20041824932781357,
      "grad_norm": 0.01927877590060234,
      "kl": 0.015355587005615234,
      "learning_rate": 4.242333890160299e-06,
      "loss": -0.0106,
      "num_tokens": 209469686.0,
      "reward": 0.43422627449035645,
      "reward_std": 0.05323384702205658,
      "rewards/gemini_judge_reward_func/mean": 0.0982142835855484,
      "rewards/gemini_judge_reward_func/std": 0.1844794601202011,
      "rewards/semantic_correctness_reward_func/mean": 0.4017024338245392,
      "rewards/semantic_correctness_reward_func/std": 0.2130288928747177,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1021.0,
      "completions/mean_length": 157.25,
      "completions/mean_terminated_length": 145.4841766357422,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.20075967735051856,
      "grad_norm": 0.022314567118883133,
      "kl": 0.02434396743774414,
      "learning_rate": 4.225994304594994e-06,
      "loss": 0.0151,
      "num_tokens": 209832814.0,
      "reward": 0.4395473003387451,
      "reward_std": 0.06388600915670395,
      "rewards/gemini_judge_reward_func/mean": 0.1540178507566452,
      "rewards/gemini_judge_reward_func/std": 0.29044100642204285,
      "rewards/semantic_correctness_reward_func/mean": 0.4597005844116211,
      "rewards/semantic_correctness_reward_func/std": 0.22340041399002075,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 588
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 285.0,
      "completions/mean_length": 154.54464721679688,
      "completions/mean_terminated_length": 138.73635864257812,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.2011011053732235,
      "grad_norm": 0.022513817995786667,
      "kl": 0.018211841583251953,
      "learning_rate": 4.209663183431969e-06,
      "loss": -0.0298,
      "num_tokens": 210184528.0,
      "reward": 0.4027530252933502,
      "reward_std": 0.05283074453473091,
      "rewards/gemini_judge_reward_func/mean": 0.0959821417927742,
      "rewards/gemini_judge_reward_func/std": 0.20018360018730164,
      "rewards/semantic_correctness_reward_func/mean": 0.3873276114463806,
      "rewards/semantic_correctness_reward_func/std": 0.18129631876945496,
      "rewards/xmlcount_reward_func/mean": 0.7172366380691528,
      "rewards/xmlcount_reward_func/std": 0.45090451836586,
      "step": 589
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 929.0,
      "completions/mean_length": 161.55804443359375,
      "completions/mean_terminated_length": 157.6905975341797,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.20144253339592846,
      "grad_norm": 0.02004176191985607,
      "kl": 0.014394760131835938,
      "learning_rate": 4.193340705265746e-06,
      "loss": -0.0408,
      "num_tokens": 210528789.0,
      "reward": 0.45304736495018005,
      "reward_std": 0.059848908334970474,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.23075121641159058,
      "rewards/semantic_correctness_reward_func/mean": 0.43714746832847595,
      "rewards/semantic_correctness_reward_func/std": 0.2098701447248459,
      "rewards/xmlcount_reward_func/mean": 0.7868126034736633,
      "rewards/xmlcount_reward_func/std": 0.4040831923484802,
      "step": 590
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 259.0,
      "completions/mean_length": 150.5491180419922,
      "completions/mean_terminated_length": 138.6923065185547,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.20178396141863345,
      "grad_norm": 0.022640299052000046,
      "kl": 0.01974773406982422,
      "learning_rate": 4.17702704859633e-06,
      "loss": -0.0005,
      "num_tokens": 210894360.0,
      "reward": 0.4660325348377228,
      "reward_std": 0.05717170611023903,
      "rewards/gemini_judge_reward_func/mean": 0.1707589328289032,
      "rewards/gemini_judge_reward_func/std": 0.26177191734313965,
      "rewards/semantic_correctness_reward_func/mean": 0.456332266330719,
      "rewards/semantic_correctness_reward_func/std": 0.23281851410865784,
      "rewards/xmlcount_reward_func/mean": 0.7661563158035278,
      "rewards/xmlcount_reward_func/std": 0.4234122037887573,
      "step": 591
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 913.0,
      "completions/mean_length": 166.65179443359375,
      "completions/mean_terminated_length": 147.07762145996094,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.2021253894413384,
      "grad_norm": 0.022002065554261208,
      "kl": 0.017216205596923828,
      "learning_rate": 4.160722391827262e-06,
      "loss": -0.0401,
      "num_tokens": 211262166.0,
      "reward": 0.44295012950897217,
      "reward_std": 0.08097635209560394,
      "rewards/gemini_judge_reward_func/mean": 0.1629464328289032,
      "rewards/gemini_judge_reward_func/std": 0.28055402636528015,
      "rewards/semantic_correctness_reward_func/mean": 0.46779510378837585,
      "rewards/semantic_correctness_reward_func/std": 0.21671941876411438,
      "rewards/xmlcount_reward_func/mean": 0.7105312943458557,
      "rewards/xmlcount_reward_func/std": 0.4553159773349762,
      "step": 592
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 768.0,
      "completions/mean_length": 167.16964721679688,
      "completions/mean_terminated_length": 151.59091186523438,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.20246681746404335,
      "grad_norm": 0.020169679075479507,
      "kl": 0.0156557559967041,
      "learning_rate": 4.14442691326365e-06,
      "loss": -0.0042,
      "num_tokens": 211614188.0,
      "reward": 0.4513782262802124,
      "reward_std": 0.06877769529819489,
      "rewards/gemini_judge_reward_func/mean": 0.1763392835855484,
      "rewards/gemini_judge_reward_func/std": 0.3034524619579315,
      "rewards/semantic_correctness_reward_func/mean": 0.43846240639686584,
      "rewards/semantic_correctness_reward_func/std": 0.20841853320598602,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 593
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 164.9241180419922,
      "completions/mean_terminated_length": 137.21197509765625,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.20280824548674833,
      "grad_norm": 0.01954479329288006,
      "kl": 0.018438100814819336,
      "learning_rate": 4.128140791110243e-06,
      "loss": -0.0154,
      "num_tokens": 211984707.0,
      "reward": 0.4166203439235687,
      "reward_std": 0.059841644018888474,
      "rewards/gemini_judge_reward_func/mean": 0.1316964328289032,
      "rewards/gemini_judge_reward_func/std": 0.23784302175045013,
      "rewards/semantic_correctness_reward_func/mean": 0.4254585802555084,
      "rewards/semantic_correctness_reward_func/std": 0.21392248570919037,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 380.0,
      "completions/mean_length": 160.84375,
      "completions/mean_terminated_length": 141.13697814941406,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.2031496735094533,
      "grad_norm": 0.02386937290430069,
      "kl": 0.017685413360595703,
      "learning_rate": 4.111864203469457e-06,
      "loss": 0.0021,
      "num_tokens": 212359916.0,
      "reward": 0.4186449646949768,
      "reward_std": 0.07048141211271286,
      "rewards/gemini_judge_reward_func/mean": 0.15625,
      "rewards/gemini_judge_reward_func/std": 0.2691645324230194,
      "rewards/semantic_correctness_reward_func/mean": 0.4312067925930023,
      "rewards/semantic_correctness_reward_func/std": 0.18796810507774353,
      "rewards/xmlcount_reward_func/mean": 0.6747589111328125,
      "rewards/xmlcount_reward_func/std": 0.4678308665752411,
      "step": 595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 970.0,
      "completions/mean_length": 163.3125,
      "completions/mean_terminated_length": 151.62896728515625,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.20349110153215824,
      "grad_norm": 0.020488321781158447,
      "kl": 0.017343997955322266,
      "learning_rate": 4.0955973283394525e-06,
      "loss": -0.029,
      "num_tokens": 212738934.0,
      "reward": 0.43855446577072144,
      "reward_std": 0.0777134820818901,
      "rewards/gemini_judge_reward_func/mean": 0.1785714328289032,
      "rewards/gemini_judge_reward_func/std": 0.27494102716445923,
      "rewards/semantic_correctness_reward_func/mean": 0.4413793087005615,
      "rewards/semantic_correctness_reward_func/std": 0.2333817183971405,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 253.0,
      "completions/mean_length": 158.07589721679688,
      "completions/mean_terminated_length": 146.32127380371094,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.20383252955486322,
      "grad_norm": 0.019988469779491425,
      "kl": 0.014391899108886719,
      "learning_rate": 4.079340343612165e-06,
      "loss": -0.0016,
      "num_tokens": 213079815.0,
      "reward": 0.4672608971595764,
      "reward_std": 0.06315002590417862,
      "rewards/gemini_judge_reward_func/mean": 0.1261160671710968,
      "rewards/gemini_judge_reward_func/std": 0.2483101785182953,
      "rewards/semantic_correctness_reward_func/mean": 0.4395720660686493,
      "rewards/semantic_correctness_reward_func/std": 0.2068413943052292,
      "rewards/xmlcount_reward_func/mean": 0.8222500681877136,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 597
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 347.0,
      "completions/mean_length": 145.5491180419922,
      "completions/mean_terminated_length": 141.60987854003906,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.20417395757756818,
      "grad_norm": 0.021274050697684288,
      "kl": 0.01706838607788086,
      "learning_rate": 4.063093427071376e-06,
      "loss": -0.0041,
      "num_tokens": 213424574.0,
      "reward": 0.40272268652915955,
      "reward_std": 0.05240607634186745,
      "rewards/gemini_judge_reward_func/mean": 0.1216517835855484,
      "rewards/gemini_judge_reward_func/std": 0.23194913566112518,
      "rewards/semantic_correctness_reward_func/mean": 0.41180965304374695,
      "rewards/semantic_correctness_reward_func/std": 0.20743775367736816,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 598
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 326.0,
      "completions/mean_length": 164.43304443359375,
      "completions/mean_terminated_length": 148.80453491210938,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.20451538560027313,
      "grad_norm": 0.021274050697684288,
      "kl": 0.014577627182006836,
      "learning_rate": 4.063093427071376e-06,
      "loss": -0.0257,
      "num_tokens": 213774683.0,
      "reward": 0.458513081073761,
      "reward_std": 0.07192227244377136,
      "rewards/gemini_judge_reward_func/mean": 0.1037946417927742,
      "rewards/gemini_judge_reward_func/std": 0.2507770359516144,
      "rewards/semantic_correctness_reward_func/mean": 0.440475732088089,
      "rewards/semantic_correctness_reward_func/std": 0.21808552742004395,
      "rewards/xmlcount_reward_func/mean": 0.8222500681877136,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 599
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 653.0,
      "completions/mean_length": 142.44644165039062,
      "completions/mean_terminated_length": 134.50450134277344,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.2048568136229781,
      "grad_norm": 0.02189001813530922,
      "kl": 0.018033504486083984,
      "learning_rate": 4.046856756390767e-06,
      "loss": -0.009,
      "num_tokens": 214127431.0,
      "reward": 0.4390813112258911,
      "reward_std": 0.08607413619756699,
      "rewards/gemini_judge_reward_func/mean": 0.1852678507566452,
      "rewards/gemini_judge_reward_func/std": 0.2980608642101288,
      "rewards/semantic_correctness_reward_func/mean": 0.4807279407978058,
      "rewards/semantic_correctness_reward_func/std": 0.22973637282848358,
      "rewards/xmlcount_reward_func/mean": 0.6720714569091797,
      "rewards/xmlcount_reward_func/std": 0.46528491377830505,
      "step": 600
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 911.0,
      "completions/mean_length": 139.8482208251953,
      "completions/mean_terminated_length": 135.88340759277344,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.20519824164568307,
      "grad_norm": 0.02204316481947899,
      "kl": 0.017858505249023438,
      "learning_rate": 4.03063050913196e-06,
      "loss": -0.0078,
      "num_tokens": 214486013.0,
      "reward": 0.38387420773506165,
      "reward_std": 0.047508224844932556,
      "rewards/gemini_judge_reward_func/mean": 0.1238839253783226,
      "rewards/gemini_judge_reward_func/std": 0.23675422370433807,
      "rewards/semantic_correctness_reward_func/mean": 0.43824586272239685,
      "rewards/semantic_correctness_reward_func/std": 0.20813381671905518,
      "rewards/xmlcount_reward_func/mean": 0.6166786551475525,
      "rewards/xmlcount_reward_func/std": 0.4832932651042938,
      "step": 601
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 277.0,
      "completions/mean_length": 168.6741180419922,
      "completions/mean_terminated_length": 145.13302612304688,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.20553966966838802,
      "grad_norm": 0.01992730423808098,
      "kl": 0.016730308532714844,
      "learning_rate": 4.0144148627426e-06,
      "loss": -0.0043,
      "num_tokens": 214836156.0,
      "reward": 0.44369426369667053,
      "reward_std": 0.05480070784687996,
      "rewards/gemini_judge_reward_func/mean": 0.1316964328289032,
      "rewards/gemini_judge_reward_func/std": 0.24709029495716095,
      "rewards/semantic_correctness_reward_func/mean": 0.42489978671073914,
      "rewards/semantic_correctness_reward_func/std": 0.19661790132522583,
      "rewards/xmlcount_reward_func/mean": 0.7650893330574036,
      "rewards/xmlcount_reward_func/std": 0.423270583152771,
      "step": 602
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 672.0,
      "completions/mean_length": 172.0982208251953,
      "completions/mean_terminated_length": 152.6483917236328,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.205881097691093,
      "grad_norm": 0.021027710288763046,
      "kl": 0.01509237289428711,
      "learning_rate": 3.998209994554395e-06,
      "loss": 0.0116,
      "num_tokens": 215171166.0,
      "reward": 0.45906171202659607,
      "reward_std": 0.0753428265452385,
      "rewards/gemini_judge_reward_func/mean": 0.15625,
      "rewards/gemini_judge_reward_func/std": 0.28138232231140137,
      "rewards/semantic_correctness_reward_func/mean": 0.4455583989620209,
      "rewards/semantic_correctness_reward_func/std": 0.22236627340316772,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 603
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.0,
      "completions/mean_length": 178.75894165039062,
      "completions/mean_terminated_length": 151.4930877685547,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.20622252571379796,
      "grad_norm": 0.020675111562013626,
      "kl": 0.017548084259033203,
      "learning_rate": 3.982016081781189e-06,
      "loss": -0.0197,
      "num_tokens": 215552816.0,
      "reward": 0.3967207670211792,
      "reward_std": 0.06549742072820663,
      "rewards/gemini_judge_reward_func/mean": 0.1049107164144516,
      "rewards/gemini_judge_reward_func/std": 0.22495238482952118,
      "rewards/semantic_correctness_reward_func/mean": 0.45103222131729126,
      "rewards/semantic_correctness_reward_func/std": 0.18998552858829498,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 604
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 719.0,
      "completions/mean_length": 154.8303680419922,
      "completions/mean_terminated_length": 147.0,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.20656395373650294,
      "grad_norm": 0.021269556134939194,
      "kl": 0.01663351058959961,
      "learning_rate": 3.965833301517017e-06,
      "loss": 0.0136,
      "num_tokens": 215937378.0,
      "reward": 0.4003709554672241,
      "reward_std": 0.07181476801633835,
      "rewards/gemini_judge_reward_func/mean": 0.1350446492433548,
      "rewards/gemini_judge_reward_func/std": 0.22932343184947968,
      "rewards/semantic_correctness_reward_func/mean": 0.43586355447769165,
      "rewards/semantic_correctness_reward_func/std": 0.20265518128871918,
      "rewards/xmlcount_reward_func/mean": 0.6479509472846985,
      "rewards/xmlcount_reward_func/std": 0.47591251134872437,
      "step": 605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 852.0,
      "completions/mean_length": 166.24554443359375,
      "completions/mean_terminated_length": 150.64999389648438,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.2069053817592079,
      "grad_norm": 0.02032295987010002,
      "kl": 0.016942501068115234,
      "learning_rate": 3.949661830734172e-06,
      "loss": 0.0044,
      "num_tokens": 216288897.0,
      "reward": 0.4458765387535095,
      "reward_std": 0.07394890487194061,
      "rewards/gemini_judge_reward_func/mean": 0.1339285671710968,
      "rewards/gemini_judge_reward_func/std": 0.25919973850250244,
      "rewards/semantic_correctness_reward_func/mean": 0.4432217478752136,
      "rewards/semantic_correctness_reward_func/std": 0.20972265303134918,
      "rewards/xmlcount_reward_func/mean": 0.7591518759727478,
      "rewards/xmlcount_reward_func/std": 0.42889249324798584,
      "step": 606
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 900.0,
      "completions/mean_length": 148.90625,
      "completions/mean_terminated_length": 144.98207092285156,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.20724680978191284,
      "grad_norm": 0.021965689957141876,
      "kl": 0.017894744873046875,
      "learning_rate": 3.9335018462812664e-06,
      "loss": -0.0234,
      "num_tokens": 216636048.0,
      "reward": 0.4070368707180023,
      "reward_std": 0.06277727335691452,
      "rewards/gemini_judge_reward_func/mean": 0.1171875,
      "rewards/gemini_judge_reward_func/std": 0.21161696314811707,
      "rewards/semantic_correctness_reward_func/mean": 0.44230917096138,
      "rewards/semantic_correctness_reward_func/std": 0.20678167045116425,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 607
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 468.0,
      "completions/max_terminated_length": 468.0,
      "completions/mean_length": 146.91964721679688,
      "completions/mean_terminated_length": 146.91964721679688,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.20758823780461783,
      "grad_norm": 0.02472434565424919,
      "kl": 0.03131532669067383,
      "learning_rate": 3.9173535248813026e-06,
      "loss": 0.0065,
      "num_tokens": 216973558.0,
      "reward": 0.4831780791282654,
      "reward_std": 0.0640798807144165,
      "rewards/gemini_judge_reward_func/mean": 0.1852678507566452,
      "rewards/gemini_judge_reward_func/std": 0.28655508160591125,
      "rewards/semantic_correctness_reward_func/mean": 0.4723544120788574,
      "rewards/semantic_correctness_reward_func/std": 0.23760126531124115,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 608
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 545.0,
      "completions/mean_length": 166.8169708251953,
      "completions/mean_terminated_length": 151.2318115234375,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.20792966582732278,
      "grad_norm": 0.01872190646827221,
      "kl": 0.011590242385864258,
      "learning_rate": 3.901217043129735e-06,
      "loss": 0.008,
      "num_tokens": 217317493.0,
      "reward": 0.42884668707847595,
      "reward_std": 0.06799106299877167,
      "rewards/gemini_judge_reward_func/mean": 0.1104910746216774,
      "rewards/gemini_judge_reward_func/std": 0.23151718080043793,
      "rewards/semantic_correctness_reward_func/mean": 0.4217510223388672,
      "rewards/semantic_correctness_reward_func/std": 0.19558602571487427,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 609
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 338.0,
      "completions/mean_length": 159.5491180419922,
      "completions/mean_terminated_length": 143.83181762695312,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.20827109385002773,
      "grad_norm": 0.020979879423975945,
      "kl": 0.018959522247314453,
      "learning_rate": 3.885092577492543e-06,
      "loss": -0.009,
      "num_tokens": 217679120.0,
      "reward": 0.39373713731765747,
      "reward_std": 0.05377659946680069,
      "rewards/gemini_judge_reward_func/mean": 0.1015625,
      "rewards/gemini_judge_reward_func/std": 0.20642106235027313,
      "rewards/semantic_correctness_reward_func/mean": 0.4428104758262634,
      "rewards/semantic_correctness_reward_func/std": 0.2033531814813614,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 610
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 457.0,
      "completions/max_terminated_length": 457.0,
      "completions/mean_length": 150.07144165039062,
      "completions/mean_terminated_length": 150.07144165039062,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.20861252187273271,
      "grad_norm": 0.021496569737792015,
      "kl": 0.01505589485168457,
      "learning_rate": 3.8689803043043e-06,
      "loss": 0.0073,
      "num_tokens": 218027112.0,
      "reward": 0.45133811235427856,
      "reward_std": 0.061567071825265884,
      "rewards/gemini_judge_reward_func/mean": 0.1015625,
      "rewards/gemini_judge_reward_func/std": 0.2170114368200302,
      "rewards/semantic_correctness_reward_func/mean": 0.409065306186676,
      "rewards/semantic_correctness_reward_func/std": 0.21651864051818848,
      "rewards/xmlcount_reward_func/mean": 0.8222500681877136,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 611
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 562.0,
      "completions/mean_length": 150.3616180419922,
      "completions/mean_terminated_length": 138.5022735595703,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.20895394989543767,
      "grad_norm": 0.021522024646401405,
      "kl": 0.01940298080444336,
      "learning_rate": 3.852880399766243e-06,
      "loss": -0.0166,
      "num_tokens": 218409965.0,
      "reward": 0.38020059466362,
      "reward_std": 0.05251891911029816,
      "rewards/gemini_judge_reward_func/mean": 0.1004464253783226,
      "rewards/gemini_judge_reward_func/std": 0.2330818623304367,
      "rewards/semantic_correctness_reward_func/mean": 0.4131101965904236,
      "rewards/semantic_correctness_reward_func/std": 0.1949501633644104,
      "rewards/xmlcount_reward_func/mean": 0.643500030040741,
      "rewards/xmlcount_reward_func/std": 0.48071083426475525,
      "step": 612
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 832.0,
      "completions/mean_length": 148.43304443359375,
      "completions/mean_terminated_length": 144.50672912597656,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.20929537791814262,
      "grad_norm": 0.021798672154545784,
      "kl": 0.01602315902709961,
      "learning_rate": 3.8367930399443495e-06,
      "loss": -0.0188,
      "num_tokens": 218773734.0,
      "reward": 0.39457279443740845,
      "reward_std": 0.059662409126758575,
      "rewards/gemini_judge_reward_func/mean": 0.1160714253783226,
      "rewards/gemini_judge_reward_func/std": 0.23776936531066895,
      "rewards/semantic_correctness_reward_func/mean": 0.4179709851741791,
      "rewards/semantic_correctness_reward_func/std": 0.2051243931055069,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 613
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 865.0,
      "completions/max_terminated_length": 865.0,
      "completions/mean_length": 146.7053680419922,
      "completions/mean_terminated_length": 146.7053680419922,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.2096368059408476,
      "grad_norm": 0.021316422149538994,
      "kl": 0.017024993896484375,
      "learning_rate": 3.820718400767409e-06,
      "loss": -0.0117,
      "num_tokens": 219123324.0,
      "reward": 0.41686007380485535,
      "reward_std": 0.0637628361582756,
      "rewards/gemini_judge_reward_func/mean": 0.1049107164144516,
      "rewards/gemini_judge_reward_func/std": 0.21863414347171783,
      "rewards/semantic_correctness_reward_func/mean": 0.462336003780365,
      "rewards/semantic_correctness_reward_func/std": 0.19964618980884552,
      "rewards/xmlcount_reward_func/mean": 0.7060714364051819,
      "rewards/xmlcount_reward_func/std": 0.4524170756340027,
      "step": 614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 164.5669708251953,
      "completions/mean_terminated_length": 152.90045166015625,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.20997823396355256,
      "grad_norm": 0.021478435024619102,
      "kl": 0.017641544342041016,
      "learning_rate": 3.8046566580251e-06,
      "loss": 0.0127,
      "num_tokens": 219468403.0,
      "reward": 0.44625985622406006,
      "reward_std": 0.043265651911497116,
      "rewards/gemini_judge_reward_func/mean": 0.1071428582072258,
      "rewards/gemini_judge_reward_func/std": 0.20149186253547668,
      "rewards/semantic_correctness_reward_func/mean": 0.4440133273601532,
      "rewards/semantic_correctness_reward_func/std": 0.21532121300697327,
      "rewards/xmlcount_reward_func/mean": 0.7864999771118164,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 615
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 685.0,
      "completions/mean_length": 157.15179443359375,
      "completions/mean_terminated_length": 145.38462829589844,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.2103196619862575,
      "grad_norm": 0.02019723691046238,
      "kl": 0.014769792556762695,
      "learning_rate": 3.7886079873660693e-06,
      "loss": -0.0266,
      "num_tokens": 219809233.0,
      "reward": 0.446768581867218,
      "reward_std": 0.06777739524841309,
      "rewards/gemini_judge_reward_func/mean": 0.1183035746216774,
      "rewards/gemini_judge_reward_func/std": 0.2482219785451889,
      "rewards/semantic_correctness_reward_func/mean": 0.42423561215400696,
      "rewards/semantic_correctness_reward_func/std": 0.18544653058052063,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 616
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 295.0,
      "completions/mean_length": 162.58482360839844,
      "completions/mean_terminated_length": 146.9227294921875,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.2106610900089625,
      "grad_norm": 0.02150016464293003,
      "kl": 0.015299558639526367,
      "learning_rate": 3.7725725642960047e-06,
      "loss": 0.0054,
      "num_tokens": 220172916.0,
      "reward": 0.448917955160141,
      "reward_std": 0.049533385783433914,
      "rewards/gemini_judge_reward_func/mean": 0.0970982164144516,
      "rewards/gemini_judge_reward_func/std": 0.22285966575145721,
      "rewards/semantic_correctness_reward_func/mean": 0.41483062505722046,
      "rewards/semantic_correctness_reward_func/std": 0.2128439098596573,
      "rewards/xmlcount_reward_func/mean": 0.8177813291549683,
      "rewards/xmlcount_reward_func/std": 0.3879494369029999,
      "step": 617
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 339.0,
      "completions/max_terminated_length": 339.0,
      "completions/mean_length": 142.2678680419922,
      "completions/mean_terminated_length": 142.2678680419922,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.21100251803166745,
      "grad_norm": 0.023679913952946663,
      "kl": 0.01729297637939453,
      "learning_rate": 3.756550564175727e-06,
      "loss": -0.0221,
      "num_tokens": 220527948.0,
      "reward": 0.4694575071334839,
      "reward_std": 0.08708461374044418,
      "rewards/gemini_judge_reward_func/mean": 0.1696428507566452,
      "rewards/gemini_judge_reward_func/std": 0.27956220507621765,
      "rewards/semantic_correctness_reward_func/mean": 0.4707517921924591,
      "rewards/semantic_correctness_reward_func/std": 0.2081519216299057,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 618
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 515.0,
      "completions/mean_length": 173.17857360839844,
      "completions/mean_terminated_length": 145.7327117919922,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.2113439460543724,
      "grad_norm": 0.02082088217139244,
      "kl": 0.014158248901367188,
      "learning_rate": 3.7405421622192607e-06,
      "loss": -0.02,
      "num_tokens": 220909056.0,
      "reward": 0.4316154420375824,
      "reward_std": 0.06896168738603592,
      "rewards/gemini_judge_reward_func/mean": 0.1350446492433548,
      "rewards/gemini_judge_reward_func/std": 0.24239464104175568,
      "rewards/semantic_correctness_reward_func/mean": 0.4688715636730194,
      "rewards/semantic_correctness_reward_func/std": 0.17802277207374573,
      "rewards/xmlcount_reward_func/mean": 0.7095580697059631,
      "rewards/xmlcount_reward_func/std": 0.45223551988601685,
      "step": 619
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 427.0,
      "completions/mean_length": 152.86607360839844,
      "completions/mean_terminated_length": 145.0180206298828,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.21168537407707738,
      "grad_norm": 0.020124070346355438,
      "kl": 0.014010190963745117,
      "learning_rate": 3.7245475334919246e-06,
      "loss": -0.0191,
      "num_tokens": 221245478.0,
      "reward": 0.41337040066719055,
      "reward_std": 0.0564776174724102,
      "rewards/gemini_judge_reward_func/mean": 0.1116071417927742,
      "rewards/gemini_judge_reward_func/std": 0.191873237490654,
      "rewards/semantic_correctness_reward_func/mean": 0.4136374294757843,
      "rewards/semantic_correctness_reward_func/std": 0.1768476963043213,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 620
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 927.0,
      "completions/mean_length": 159.0178680419922,
      "completions/mean_terminated_length": 151.22523498535156,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.21202680209978234,
      "grad_norm": 0.0207071453332901,
      "kl": 0.014678478240966797,
      "learning_rate": 3.7085668529084183e-06,
      "loss": -0.0211,
      "num_tokens": 221594150.0,
      "reward": 0.4323264956474304,
      "reward_std": 0.06878480315208435,
      "rewards/gemini_judge_reward_func/mean": 0.1015625,
      "rewards/gemini_judge_reward_func/std": 0.2170114368200302,
      "rewards/semantic_correctness_reward_func/mean": 0.4190162122249603,
      "rewards/semantic_correctness_reward_func/std": 0.20656156539916992,
      "rewards/xmlcount_reward_func/mean": 0.7697455286979675,
      "rewards/xmlcount_reward_func/std": 0.4218544363975525,
      "step": 621
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 776.0,
      "completions/mean_length": 166.12054443359375,
      "completions/mean_terminated_length": 146.53424072265625,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.21236823012248732,
      "grad_norm": 0.01952914334833622,
      "kl": 0.014682769775390625,
      "learning_rate": 3.6926002952309015e-06,
      "loss": -0.0159,
      "num_tokens": 221943565.0,
      "reward": 0.4331689774990082,
      "reward_std": 0.06450119614601135,
      "rewards/gemini_judge_reward_func/mean": 0.1651785671710968,
      "rewards/gemini_judge_reward_func/std": 0.24730288982391357,
      "rewards/semantic_correctness_reward_func/mean": 0.4769876301288605,
      "rewards/semantic_correctness_reward_func/std": 0.22601205110549927,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 622
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 284.0,
      "completions/mean_length": 150.64732360839844,
      "completions/mean_terminated_length": 142.77928161621094,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.21270965814519227,
      "grad_norm": 0.02059789188206196,
      "kl": 0.014481544494628906,
      "learning_rate": 3.676648035067093e-06,
      "loss": -0.0166,
      "num_tokens": 222291158.0,
      "reward": 0.42687147855758667,
      "reward_std": 0.06625192612409592,
      "rewards/gemini_judge_reward_func/mean": 0.1227678582072258,
      "rewards/gemini_judge_reward_func/std": 0.23555946350097656,
      "rewards/semantic_correctness_reward_func/mean": 0.4230715334415436,
      "rewards/semantic_correctness_reward_func/std": 0.18972373008728027,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 623
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 958.0,
      "completions/mean_length": 153.20982360839844,
      "completions/mean_terminated_length": 149.30494689941406,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.21305108616789722,
      "grad_norm": 0.020177403464913368,
      "kl": 0.01506662368774414,
      "learning_rate": 3.6607102468683524e-06,
      "loss": 0.002,
      "num_tokens": 222639001.0,
      "reward": 0.4309717118740082,
      "reward_std": 0.06068568304181099,
      "rewards/gemini_judge_reward_func/mean": 0.1294642835855484,
      "rewards/gemini_judge_reward_func/std": 0.24140411615371704,
      "rewards/semantic_correctness_reward_func/mean": 0.4302067160606384,
      "rewards/semantic_correctness_reward_func/std": 0.21803805232048035,
      "rewards/xmlcount_reward_func/mean": 0.7328616380691528,
      "rewards/xmlcount_reward_func/std": 0.4410996735095978,
      "step": 624
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 575.0,
      "completions/max_terminated_length": 575.0,
      "completions/mean_length": 154.47769165039062,
      "completions/mean_terminated_length": 154.47769165039062,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.2133925141906022,
      "grad_norm": 0.020750742405653,
      "kl": 0.018827438354492188,
      "learning_rate": 3.64478710492778e-06,
      "loss": -0.0,
      "num_tokens": 222961280.0,
      "reward": 0.4677436947822571,
      "reward_std": 0.056711845099925995,
      "rewards/gemini_judge_reward_func/mean": 0.1595982164144516,
      "rewards/gemini_judge_reward_func/std": 0.2677079737186432,
      "rewards/semantic_correctness_reward_func/mean": 0.4465217590332031,
      "rewards/semantic_correctness_reward_func/std": 0.2243902087211609,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 625
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.0,
      "completions/mean_length": 155.09375,
      "completions/mean_terminated_length": 143.29864501953125,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.21373394221330716,
      "grad_norm": 0.026382334530353546,
      "kl": 0.022363662719726562,
      "learning_rate": 3.628878783378302e-06,
      "loss": -0.013,
      "num_tokens": 223344509.0,
      "reward": 0.40488913655281067,
      "reward_std": 0.06225220486521721,
      "rewards/gemini_judge_reward_func/mean": 0.1216517835855484,
      "rewards/gemini_judge_reward_func/std": 0.23435331881046295,
      "rewards/semantic_correctness_reward_func/mean": 0.42264196276664734,
      "rewards/semantic_correctness_reward_func/std": 0.20638230443000793,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 626
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 669.0,
      "completions/mean_length": 161.2544708251953,
      "completions/mean_terminated_length": 145.56817626953125,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.2140753702360121,
      "grad_norm": 0.020892329514026642,
      "kl": 0.016319751739501953,
      "learning_rate": 3.6129854561907786e-06,
      "loss": -0.0176,
      "num_tokens": 223712174.0,
      "reward": 0.4206882119178772,
      "reward_std": 0.04852227121591568,
      "rewards/gemini_judge_reward_func/mean": 0.0881696417927742,
      "rewards/gemini_judge_reward_func/std": 0.17807073891162872,
      "rewards/semantic_correctness_reward_func/mean": 0.4256015419960022,
      "rewards/semantic_correctness_reward_func/std": 0.20432168245315552,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 627
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 582.0,
      "completions/mean_length": 155.27679443359375,
      "completions/mean_terminated_length": 147.45045471191406,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.2144167982587171,
      "grad_norm": 0.023052629083395004,
      "kl": 0.01653289794921875,
      "learning_rate": 3.5971072971720844e-06,
      "loss": 0.0098,
      "num_tokens": 224050572.0,
      "reward": 0.4531470239162445,
      "reward_std": 0.05186094716191292,
      "rewards/gemini_judge_reward_func/mean": 0.1439732164144516,
      "rewards/gemini_judge_reward_func/std": 0.2692737579345703,
      "rewards/semantic_correctness_reward_func/mean": 0.44053834676742554,
      "rewards/semantic_correctness_reward_func/std": 0.2101813703775406,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 628
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 789.0,
      "completions/mean_length": 170.14732360839844,
      "completions/mean_terminated_length": 146.64678955078125,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.21475822628142205,
      "grad_norm": 0.02114696241915226,
      "kl": 0.019293546676635742,
      "learning_rate": 3.581244479963225e-06,
      "loss": -0.0485,
      "num_tokens": 224408285.0,
      "reward": 0.42841026186943054,
      "reward_std": 0.07736083120107651,
      "rewards/gemini_judge_reward_func/mean": 0.125,
      "rewards/gemini_judge_reward_func/std": 0.22708921134471893,
      "rewards/semantic_correctness_reward_func/mean": 0.43763142824172974,
      "rewards/semantic_correctness_reward_func/std": 0.21282999217510223,
      "rewards/xmlcount_reward_func/mean": 0.7272098660469055,
      "rewards/xmlcount_reward_func/std": 0.4462122321128845,
      "step": 629
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 328.0,
      "completions/mean_length": 147.69644165039062,
      "completions/mean_terminated_length": 139.8018035888672,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.215099654304127,
      "grad_norm": 0.02138863503932953,
      "kl": 0.016696453094482422,
      "learning_rate": 3.56539717803743e-06,
      "loss": -0.0161,
      "num_tokens": 224749813.0,
      "reward": 0.44623899459838867,
      "reward_std": 0.06276748329401016,
      "rewards/gemini_judge_reward_func/mean": 0.1049107164144516,
      "rewards/gemini_judge_reward_func/std": 0.22244662046432495,
      "rewards/semantic_correctness_reward_func/mean": 0.4841232895851135,
      "rewards/semantic_correctness_reward_func/std": 0.20933982729911804,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 630
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 973.0,
      "completions/mean_length": 146.79019165039062,
      "completions/mean_terminated_length": 142.85650634765625,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.21544108232683198,
      "grad_norm": 0.0239239614456892,
      "kl": 0.018682479858398438,
      "learning_rate": 3.5495655646982506e-06,
      "loss": -0.0027,
      "num_tokens": 225095170.0,
      "reward": 0.44244566559791565,
      "reward_std": 0.0652119442820549,
      "rewards/gemini_judge_reward_func/mean": 0.1573660671710968,
      "rewards/gemini_judge_reward_func/std": 0.27110758423805237,
      "rewards/semantic_correctness_reward_func/mean": 0.46749600768089294,
      "rewards/semantic_correctness_reward_func/std": 0.21619705855846405,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 631
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 297.0,
      "completions/max_terminated_length": 297.0,
      "completions/mean_length": 144.5357208251953,
      "completions/mean_terminated_length": 144.5357208251953,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.21578251034953694,
      "grad_norm": 0.0239239614456892,
      "kl": 0.016391754150390625,
      "learning_rate": 3.5495655646982506e-06,
      "loss": 0.0224,
      "num_tokens": 225443274.0,
      "reward": 0.4401990473270416,
      "reward_std": 0.06552834808826447,
      "rewards/gemini_judge_reward_func/mean": 0.1127232164144516,
      "rewards/gemini_judge_reward_func/std": 0.21921423077583313,
      "rewards/semantic_correctness_reward_func/mean": 0.41371825337409973,
      "rewards/semantic_correctness_reward_func/std": 0.21398547291755676,
      "rewards/xmlcount_reward_func/mean": 0.7809152603149414,
      "rewards/xmlcount_reward_func/std": 0.4144832491874695,
      "step": 632
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 677.0,
      "completions/mean_length": 157.38839721679688,
      "completions/mean_terminated_length": 149.58108520507812,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.2161239383722419,
      "grad_norm": 0.019696949049830437,
      "kl": 0.017939090728759766,
      "learning_rate": 3.533749813077677e-06,
      "loss": -0.05,
      "num_tokens": 225808005.0,
      "reward": 0.46345487236976624,
      "reward_std": 0.08401855826377869,
      "rewards/gemini_judge_reward_func/mean": 0.1540178507566452,
      "rewards/gemini_judge_reward_func/std": 0.27251842617988586,
      "rewards/semantic_correctness_reward_func/mean": 0.4541133940219879,
      "rewards/semantic_correctness_reward_func/std": 0.20422282814979553,
      "rewards/xmlcount_reward_func/mean": 0.7775625586509705,
      "rewards/xmlcount_reward_func/std": 0.4177508056163788,
      "step": 633
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 175.32144165039062,
      "completions/mean_terminated_length": 155.94520568847656,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.21646536639494687,
      "grad_norm": 0.020858891308307648,
      "kl": 0.016012191772460938,
      "learning_rate": 3.517950096134232e-06,
      "loss": -0.0292,
      "num_tokens": 226140553.0,
      "reward": 0.4431969225406647,
      "reward_std": 0.05470741167664528,
      "rewards/gemini_judge_reward_func/mean": 0.1104910746216774,
      "rewards/gemini_judge_reward_func/std": 0.21389874815940857,
      "rewards/semantic_correctness_reward_func/mean": 0.4220024645328522,
      "rewards/semantic_correctness_reward_func/std": 0.20806531608104706,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 634
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1007.0,
      "completions/mean_length": 158.625,
      "completions/mean_terminated_length": 154.74440002441406,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.21680679441765183,
      "grad_norm": 0.020396729931235313,
      "kl": 0.012579917907714844,
      "learning_rate": 3.5021665866510924e-06,
      "loss": -0.013,
      "num_tokens": 226466773.0,
      "reward": 0.45861443877220154,
      "reward_std": 0.06587579846382141,
      "rewards/gemini_judge_reward_func/mean": 0.1194196417927742,
      "rewards/gemini_judge_reward_func/std": 0.22202719748020172,
      "rewards/semantic_correctness_reward_func/mean": 0.4454827308654785,
      "rewards/semantic_correctness_reward_func/std": 0.2230159491300583,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 635
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 159.45982360839844,
      "completions/mean_terminated_length": 147.72398376464844,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.21714822244035678,
      "grad_norm": 0.02042062021791935,
      "kl": 0.015841245651245117,
      "learning_rate": 3.4863994572341845e-06,
      "loss": -0.0106,
      "num_tokens": 226816248.0,
      "reward": 0.42706283926963806,
      "reward_std": 0.06164560839533806,
      "rewards/gemini_judge_reward_func/mean": 0.1116071417927742,
      "rewards/gemini_judge_reward_func/std": 0.23037132620811462,
      "rewards/semantic_correctness_reward_func/mean": 0.41059979796409607,
      "rewards/semantic_correctness_reward_func/std": 0.1852397918701172,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 287.0,
      "completions/mean_length": 157.96875,
      "completions/mean_terminated_length": 150.1666717529297,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.21748965046306176,
      "grad_norm": 0.02099723368883133,
      "kl": 0.012554168701171875,
      "learning_rate": 3.470648880310313e-06,
      "loss": -0.0144,
      "num_tokens": 227145813.0,
      "reward": 0.5042933821678162,
      "reward_std": 0.06609771400690079,
      "rewards/gemini_judge_reward_func/mean": 0.1707589328289032,
      "rewards/gemini_judge_reward_func/std": 0.2563627362251282,
      "rewards/semantic_correctness_reward_func/mean": 0.4639488756656647,
      "rewards/semantic_correctness_reward_func/std": 0.22148670256137848,
      "rewards/xmlcount_reward_func/mean": 0.8580000996589661,
      "rewards/xmlcount_reward_func/std": 0.35106155276298523,
      "step": 637
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 396.0,
      "completions/mean_length": 160.8794708251953,
      "completions/mean_terminated_length": 149.16290283203125,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.21783107848576672,
      "grad_norm": 0.020512668415904045,
      "kl": 0.01687908172607422,
      "learning_rate": 3.4549150281252635e-06,
      "loss": 0.0107,
      "num_tokens": 227496678.0,
      "reward": 0.435072660446167,
      "reward_std": 0.05142156034708023,
      "rewards/gemini_judge_reward_func/mean": 0.1049107164144516,
      "rewards/gemini_judge_reward_func/std": 0.20812636613845825,
      "rewards/semantic_correctness_reward_func/mean": 0.4282917380332947,
      "rewards/semantic_correctness_reward_func/std": 0.22628170251846313,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 638
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 283.0,
      "completions/max_terminated_length": 283.0,
      "completions/mean_length": 148.72769165039062,
      "completions/mean_terminated_length": 148.72769165039062,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.21817250650847167,
      "grad_norm": 0.021282676607370377,
      "kl": 0.013463735580444336,
      "learning_rate": 3.4391980727419206e-06,
      "loss": 0.0127,
      "num_tokens": 227858717.0,
      "reward": 0.4656349718570709,
      "reward_std": 0.08616151660680771,
      "rewards/gemini_judge_reward_func/mean": 0.1428571492433548,
      "rewards/gemini_judge_reward_func/std": 0.26726123690605164,
      "rewards/semantic_correctness_reward_func/mean": 0.4337104260921478,
      "rewards/semantic_correctness_reward_func/std": 0.2192811518907547,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 639
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 813.0,
      "completions/mean_length": 173.65626525878906,
      "completions/mean_terminated_length": 150.25228881835938,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.21851393453117665,
      "grad_norm": 0.018349410966038704,
      "kl": 0.014457941055297852,
      "learning_rate": 3.423498186038393e-06,
      "loss": -0.0335,
      "num_tokens": 228224992.0,
      "reward": 0.4189697206020355,
      "reward_std": 0.06830798089504242,
      "rewards/gemini_judge_reward_func/mean": 0.1328125,
      "rewards/gemini_judge_reward_func/std": 0.26774534583091736,
      "rewards/semantic_correctness_reward_func/mean": 0.4349732995033264,
      "rewards/semantic_correctness_reward_func/std": 0.21521888673305511,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 640
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 663.0,
      "completions/mean_length": 159.75,
      "completions/mean_terminated_length": 151.96397399902344,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.2188553625538816,
      "grad_norm": 0.02072848007082939,
      "kl": 0.014017105102539062,
      "learning_rate": 3.4078155397061243e-06,
      "loss": 0.0124,
      "num_tokens": 228581368.0,
      "reward": 0.43587082624435425,
      "reward_std": 0.06837836652994156,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.23316773772239685,
      "rewards/semantic_correctness_reward_func/mean": 0.4233896732330322,
      "rewards/semantic_correctness_reward_func/std": 0.21983756124973297,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 641
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 882.0,
      "completions/mean_length": 175.80357360839844,
      "completions/mean_terminated_length": 156.4383544921875,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.2191967905765866,
      "grad_norm": 0.0204218327999115,
      "kl": 0.015043258666992188,
      "learning_rate": 3.3921503052480243e-06,
      "loss": -0.0068,
      "num_tokens": 228939304.0,
      "reward": 0.41249316930770874,
      "reward_std": 0.04543416202068329,
      "rewards/gemini_judge_reward_func/mean": 0.09375,
      "rewards/gemini_judge_reward_func/std": 0.20537890493869781,
      "rewards/semantic_correctness_reward_func/mean": 0.41145679354667664,
      "rewards/semantic_correctness_reward_func/std": 0.19346196949481964,
      "rewards/xmlcount_reward_func/mean": 0.7317544221878052,
      "rewards/xmlcount_reward_func/std": 0.4439154863357544,
      "step": 642
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 898.0,
      "completions/mean_length": 157.9375,
      "completions/mean_terminated_length": 146.18099975585938,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.21953821859929154,
      "grad_norm": 0.020854616537690163,
      "kl": 0.014810562133789062,
      "learning_rate": 3.3765026539765832e-06,
      "loss": -0.028,
      "num_tokens": 229304102.0,
      "reward": 0.4532719552516937,
      "reward_std": 0.07113207876682281,
      "rewards/gemini_judge_reward_func/mean": 0.1428571492433548,
      "rewards/gemini_judge_reward_func/std": 0.26935040950775146,
      "rewards/semantic_correctness_reward_func/mean": 0.4612525403499603,
      "rewards/semantic_correctness_reward_func/std": 0.216531440615654,
      "rewards/xmlcount_reward_func/mean": 0.7596964240074158,
      "rewards/xmlcount_reward_func/std": 0.423846960067749,
      "step": 643
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 611.0,
      "completions/mean_length": 174.79019165039062,
      "completions/mean_terminated_length": 147.3963165283203,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.2198796466219965,
      "grad_norm": 0.022890524938702583,
      "kl": 0.015017271041870117,
      "learning_rate": 3.3608727570120114e-06,
      "loss": 0.0367,
      "num_tokens": 229678923.0,
      "reward": 0.40588462352752686,
      "reward_std": 0.05438840389251709,
      "rewards/gemini_judge_reward_func/mean": 0.1305803507566452,
      "rewards/gemini_judge_reward_func/std": 0.2719554007053375,
      "rewards/semantic_correctness_reward_func/mean": 0.42761939764022827,
      "rewards/semantic_correctness_reward_func/std": 0.22883787751197815,
      "rewards/xmlcount_reward_func/mean": 0.6703214645385742,
      "rewards/xmlcount_reward_func/std": 0.4670778214931488,
      "step": 644
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 273.0,
      "completions/max_terminated_length": 273.0,
      "completions/mean_length": 143.00894165039062,
      "completions/mean_terminated_length": 143.00894165039062,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.22022107464470148,
      "grad_norm": 0.020465996116399765,
      "kl": 0.012059688568115234,
      "learning_rate": 3.3452607852803585e-06,
      "loss": 0.0032,
      "num_tokens": 230010125.0,
      "reward": 0.48016557097435,
      "reward_std": 0.07235633581876755,
      "rewards/gemini_judge_reward_func/mean": 0.1495535671710968,
      "rewards/gemini_judge_reward_func/std": 0.2515864968299866,
      "rewards/semantic_correctness_reward_func/mean": 0.45722055435180664,
      "rewards/semantic_correctness_reward_func/std": 0.24177604913711548,
      "rewards/xmlcount_reward_func/mean": 0.8222500681877136,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 567.0,
      "completions/mean_length": 164.90179443359375,
      "completions/mean_terminated_length": 145.28765869140625,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.22056250266740643,
      "grad_norm": 0.021560531109571457,
      "kl": 0.017102718353271484,
      "learning_rate": 3.3296669095116454e-06,
      "loss": -0.0028,
      "num_tokens": 230365327.0,
      "reward": 0.44443637132644653,
      "reward_std": 0.09491990506649017,
      "rewards/gemini_judge_reward_func/mean": 0.1886160671710968,
      "rewards/gemini_judge_reward_func/std": 0.3165358304977417,
      "rewards/semantic_correctness_reward_func/mean": 0.4797532856464386,
      "rewards/semantic_correctness_reward_func/std": 0.20572076737880707,
      "rewards/xmlcount_reward_func/mean": 0.6825982332229614,
      "rewards/xmlcount_reward_func/std": 0.46513426303863525,
      "step": 646
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 531.0,
      "completions/mean_length": 159.3303680419922,
      "completions/mean_terminated_length": 143.6090850830078,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.22090393069011138,
      "grad_norm": 0.022877030074596405,
      "kl": 0.021349430084228516,
      "learning_rate": 3.3140913002379993e-06,
      "loss": -0.0177,
      "num_tokens": 230736949.0,
      "reward": 0.4325730800628662,
      "reward_std": 0.07304774224758148,
      "rewards/gemini_judge_reward_func/mean": 0.1674107164144516,
      "rewards/gemini_judge_reward_func/std": 0.29071658849716187,
      "rewards/semantic_correctness_reward_func/mean": 0.4695436656475067,
      "rewards/semantic_correctness_reward_func/std": 0.2264811396598816,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853896975517273,
      "step": 647
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 811.0,
      "completions/mean_length": 179.88394165039062,
      "completions/mean_terminated_length": 160.6118621826172,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.22124535871281636,
      "grad_norm": 0.019290877506136894,
      "kl": 0.013090372085571289,
      "learning_rate": 3.298534127791785e-06,
      "loss": -0.0044,
      "num_tokens": 231096655.0,
      "reward": 0.4430634081363678,
      "reward_std": 0.06918629258871078,
      "rewards/gemini_judge_reward_func/mean": 0.1417410671710968,
      "rewards/gemini_judge_reward_func/std": 0.2735517919063568,
      "rewards/semantic_correctness_reward_func/mean": 0.4303347170352936,
      "rewards/semantic_correctness_reward_func/std": 0.23128339648246765,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 648
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 912.0,
      "completions/mean_length": 150.9241180419922,
      "completions/mean_terminated_length": 143.05856323242188,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.22158678673552132,
      "grad_norm": 0.022577356547117233,
      "kl": 0.017051219940185547,
      "learning_rate": 3.2829955623037536e-06,
      "loss": -0.0341,
      "num_tokens": 231438194.0,
      "reward": 0.4817308783531189,
      "reward_std": 0.07706872373819351,
      "rewards/gemini_judge_reward_func/mean": 0.2109375,
      "rewards/gemini_judge_reward_func/std": 0.3183174729347229,
      "rewards/semantic_correctness_reward_func/mean": 0.4852793514728546,
      "rewards/semantic_correctness_reward_func/std": 0.22220410406589508,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 649
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 913.0,
      "completions/mean_length": 176.82144165039062,
      "completions/mean_terminated_length": 145.44444274902344,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.22192821475822627,
      "grad_norm": 0.02091185748577118,
      "kl": 0.01692962646484375,
      "learning_rate": 3.267475773701161e-06,
      "loss": -0.0186,
      "num_tokens": 231813410.0,
      "reward": 0.41409575939178467,
      "reward_std": 0.0569186732172966,
      "rewards/gemini_judge_reward_func/mean": 0.140625,
      "rewards/gemini_judge_reward_func/std": 0.24554608762264252,
      "rewards/semantic_correctness_reward_func/mean": 0.43966618180274963,
      "rewards/semantic_correctness_reward_func/std": 0.20007139444351196,
      "rewards/xmlcount_reward_func/mean": 0.6747812628746033,
      "rewards/xmlcount_reward_func/std": 0.4702270030975342,
      "step": 650
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 888.0,
      "completions/mean_length": 158.32589721679688,
      "completions/mean_terminated_length": 150.5270233154297,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.22226964278093125,
      "grad_norm": 0.021968696266412735,
      "kl": 0.017704010009765625,
      "learning_rate": 3.251974931705933e-06,
      "loss": -0.0113,
      "num_tokens": 232185319.0,
      "reward": 0.39724671840667725,
      "reward_std": 0.0701524019241333,
      "rewards/gemini_judge_reward_func/mean": 0.1383928507566452,
      "rewards/gemini_judge_reward_func/std": 0.2468166947364807,
      "rewards/semantic_correctness_reward_func/mean": 0.44034072756767273,
      "rewards/semantic_correctness_reward_func/std": 0.22161860764026642,
      "rewards/xmlcount_reward_func/mean": 0.6345536112785339,
      "rewards/xmlcount_reward_func/std": 0.4786224365234375,
      "step": 651
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 857.0,
      "completions/mean_length": 154.36607360839844,
      "completions/mean_terminated_length": 142.56109619140625,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.2226110708036362,
      "grad_norm": 0.02309529297053814,
      "kl": 0.01496124267578125,
      "learning_rate": 3.236493205832795e-06,
      "loss": -0.016,
      "num_tokens": 232526501.0,
      "reward": 0.43352967500686646,
      "reward_std": 0.060899149626493454,
      "rewards/gemini_judge_reward_func/mean": 0.109375,
      "rewards/gemini_judge_reward_func/std": 0.22530809044837952,
      "rewards/semantic_correctness_reward_func/mean": 0.42160341143608093,
      "rewards/semantic_correctness_reward_func/std": 0.24161309003829956,
      "rewards/xmlcount_reward_func/mean": 0.7636473774909973,
      "rewards/xmlcount_reward_func/std": 0.42616090178489685,
      "step": 652
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 165.7991180419922,
      "completions/mean_terminated_length": 150.19544982910156,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.22295249882634116,
      "grad_norm": 0.020331187173724174,
      "kl": 0.018056392669677734,
      "learning_rate": 3.2210307653874175e-06,
      "loss": -0.0365,
      "num_tokens": 232879984.0,
      "reward": 0.4239354431629181,
      "reward_std": 0.06748870015144348,
      "rewards/gemini_judge_reward_func/mean": 0.1160714253783226,
      "rewards/gemini_judge_reward_func/std": 0.24011527001857758,
      "rewards/semantic_correctness_reward_func/mean": 0.42178425192832947,
      "rewards/semantic_correctness_reward_func/std": 0.19532445073127747,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 653
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 931.0,
      "completions/max_terminated_length": 931.0,
      "completions/mean_length": 151.60714721679688,
      "completions/mean_terminated_length": 151.60714721679688,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.22329392684904614,
      "grad_norm": 0.019777696579694748,
      "kl": 0.015816688537597656,
      "learning_rate": 3.205587779464576e-06,
      "loss": -0.0058,
      "num_tokens": 233223568.0,
      "reward": 0.4531329274177551,
      "reward_std": 0.06646832823753357,
      "rewards/gemini_judge_reward_func/mean": 0.1841517835855484,
      "rewards/gemini_judge_reward_func/std": 0.2982853949069977,
      "rewards/semantic_correctness_reward_func/mean": 0.4673609137535095,
      "rewards/semantic_correctness_reward_func/std": 0.23493170738220215,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 654
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 249.0,
      "completions/mean_length": 155.05357360839844,
      "completions/mean_terminated_length": 143.25791931152344,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.2236353548717511,
      "grad_norm": 0.02039613015949726,
      "kl": 0.014897584915161133,
      "learning_rate": 3.1901644169462854e-06,
      "loss": -0.0026,
      "num_tokens": 233578128.0,
      "reward": 0.46499761939048767,
      "reward_std": 0.0666971355676651,
      "rewards/gemini_judge_reward_func/mean": 0.1674107164144516,
      "rewards/gemini_judge_reward_func/std": 0.27689096331596375,
      "rewards/semantic_correctness_reward_func/mean": 0.4529164731502533,
      "rewards/semantic_correctness_reward_func/std": 0.22561688721179962,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 655
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 828.0,
      "completions/mean_length": 156.30357360839844,
      "completions/mean_terminated_length": 144.52488708496094,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.22397678289445605,
      "grad_norm": 0.020914804190397263,
      "kl": 0.016788959503173828,
      "learning_rate": 3.1747608464999723e-06,
      "loss": 0.0177,
      "num_tokens": 233943512.0,
      "reward": 0.4334987998008728,
      "reward_std": 0.05882357805967331,
      "rewards/gemini_judge_reward_func/mean": 0.1316964328289032,
      "rewards/gemini_judge_reward_func/std": 0.2698609232902527,
      "rewards/semantic_correctness_reward_func/mean": 0.43835094571113586,
      "rewards/semantic_correctness_reward_func/std": 0.20942819118499756,
      "rewards/xmlcount_reward_func/mean": 0.7328750491142273,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 656
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 327.0,
      "completions/mean_length": 153.4732208251953,
      "completions/mean_terminated_length": 145.63063049316406,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.22431821091716103,
      "grad_norm": 0.01971406303346157,
      "kl": 0.014064550399780273,
      "learning_rate": 3.1593772365766107e-06,
      "loss": -0.0151,
      "num_tokens": 234292622.0,
      "reward": 0.45534926652908325,
      "reward_std": 0.06678459793329239,
      "rewards/gemini_judge_reward_func/mean": 0.1651785671710968,
      "rewards/gemini_judge_reward_func/std": 0.30513831973075867,
      "rewards/semantic_correctness_reward_func/mean": 0.44488900899887085,
      "rewards/semantic_correctness_reward_func/std": 0.22186164557933807,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 657
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 673.0,
      "completions/mean_length": 152.52232360839844,
      "completions/mean_terminated_length": 148.61436462402344,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.22465963893986599,
      "grad_norm": 0.021667256951332092,
      "kl": 0.014690876007080078,
      "learning_rate": 3.1440137554088957e-06,
      "loss": 0.0193,
      "num_tokens": 234646675.0,
      "reward": 0.42377644777297974,
      "reward_std": 0.05661001801490784,
      "rewards/gemini_judge_reward_func/mean": 0.1116071417927742,
      "rewards/gemini_judge_reward_func/std": 0.23158472776412964,
      "rewards/semantic_correctness_reward_func/mean": 0.3941677212715149,
      "rewards/semantic_correctness_reward_func/std": 0.18619193136692047,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 658
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 452.0,
      "completions/mean_length": 166.2053680419922,
      "completions/mean_terminated_length": 154.56109619140625,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.22500106696257094,
      "grad_norm": 0.021019073203206062,
      "kl": 0.016040325164794922,
      "learning_rate": 3.128670571009399e-06,
      "loss": -0.0183,
      "num_tokens": 235011845.0,
      "reward": 0.45211324095726013,
      "reward_std": 0.07263405621051788,
      "rewards/gemini_judge_reward_func/mean": 0.1439732164144516,
      "rewards/gemini_judge_reward_func/std": 0.23717662692070007,
      "rewards/semantic_correctness_reward_func/mean": 0.4532356858253479,
      "rewards/semantic_correctness_reward_func/std": 0.22988630831241608,
      "rewards/xmlcount_reward_func/mean": 0.7596920728683472,
      "rewards/xmlcount_reward_func/std": 0.42566749453544617,
      "step": 659
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 711.0,
      "completions/mean_length": 153.04464721679688,
      "completions/mean_terminated_length": 145.1981964111328,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.22534249498527592,
      "grad_norm": 0.02109494060277939,
      "kl": 0.014479875564575195,
      "learning_rate": 3.1133478511687217e-06,
      "loss": -0.0027,
      "num_tokens": 235377351.0,
      "reward": 0.43420976400375366,
      "reward_std": 0.0640973299741745,
      "rewards/gemini_judge_reward_func/mean": 0.1350446492433548,
      "rewards/gemini_judge_reward_func/std": 0.2365427315235138,
      "rewards/semantic_correctness_reward_func/mean": 0.43520933389663696,
      "rewards/semantic_correctness_reward_func/std": 0.20545201003551483,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 660
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 378.0,
      "completions/mean_length": 151.85714721679688,
      "completions/mean_terminated_length": 136.0,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.22568392300798087,
      "grad_norm": 0.02080109529197216,
      "kl": 0.018342018127441406,
      "learning_rate": 3.0980457634536775e-06,
      "loss": 0.003,
      "num_tokens": 235770759.0,
      "reward": 0.38216474652290344,
      "reward_std": 0.05819929018616676,
      "rewards/gemini_judge_reward_func/mean": 0.1082589253783226,
      "rewards/gemini_judge_reward_func/std": 0.2264685183763504,
      "rewards/semantic_correctness_reward_func/mean": 0.3983592689037323,
      "rewards/semantic_correctness_reward_func/std": 0.18890590965747833,
      "rewards/xmlcount_reward_func/mean": 0.6479732394218445,
      "rewards/xmlcount_reward_func/std": 0.47640955448150635,
      "step": 661
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 877.0,
      "completions/mean_length": 165.0803680419922,
      "completions/mean_terminated_length": 161.2287139892578,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.22602535103068586,
      "grad_norm": 0.019101275131106377,
      "kl": 0.011385202407836914,
      "learning_rate": 3.082764475205442e-06,
      "loss": -0.0045,
      "num_tokens": 236104989.0,
      "reward": 0.48020070791244507,
      "reward_std": 0.07216404378414154,
      "rewards/gemini_judge_reward_func/mean": 0.1551339328289032,
      "rewards/gemini_judge_reward_func/std": 0.2785017192363739,
      "rewards/semantic_correctness_reward_func/mean": 0.4462354779243469,
      "rewards/semantic_correctness_reward_func/std": 0.223235622048378,
      "rewards/xmlcount_reward_func/mean": 0.8222500085830688,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 662
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 807.0,
      "completions/mean_length": 166.8616180419922,
      "completions/mean_terminated_length": 151.2772674560547,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.2263667790533908,
      "grad_norm": 0.020711250603199005,
      "kl": 0.016867637634277344,
      "learning_rate": 3.06750415353774e-06,
      "loss": 0.0041,
      "num_tokens": 236459250.0,
      "reward": 0.45459437370300293,
      "reward_std": 0.07294096797704697,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.23554618656635284,
      "rewards/semantic_correctness_reward_func/mean": 0.4790251851081848,
      "rewards/semantic_correctness_reward_func/std": 0.18285901844501495,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 663
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 658.0,
      "completions/mean_length": 150.6875,
      "completions/mean_terminated_length": 146.77130126953125,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.22670820707609576,
      "grad_norm": 0.021186042577028275,
      "kl": 0.016530513763427734,
      "learning_rate": 3.052264965335e-06,
      "loss": 0.0128,
      "num_tokens": 236802736.0,
      "reward": 0.4258687198162079,
      "reward_std": 0.05062773451209068,
      "rewards/gemini_judge_reward_func/mean": 0.1071428582072258,
      "rewards/gemini_judge_reward_func/std": 0.2110041379928589,
      "rewards/semantic_correctness_reward_func/mean": 0.3867451846599579,
      "rewards/semantic_correctness_reward_func/std": 0.18891946971416473,
      "rewards/xmlcount_reward_func/mean": 0.7641562819480896,
      "rewards/xmlcount_reward_func/std": 0.4263768792152405,
      "step": 664
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 330.0,
      "completions/mean_length": 157.95982360839844,
      "completions/mean_terminated_length": 150.15765380859375,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.22704963509880075,
      "grad_norm": 0.01883574016392231,
      "kl": 0.01505136489868164,
      "learning_rate": 3.0370470772505433e-06,
      "loss": 0.0167,
      "num_tokens": 237155743.0,
      "reward": 0.4579858183860779,
      "reward_std": 0.07837632298469543,
      "rewards/gemini_judge_reward_func/mean": 0.1796875,
      "rewards/gemini_judge_reward_func/std": 0.28767722845077515,
      "rewards/semantic_correctness_reward_func/mean": 0.42905402183532715,
      "rewards/semantic_correctness_reward_func/std": 0.21172691881656647,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 665
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 818.0,
      "completions/mean_length": 149.35269165039062,
      "completions/mean_terminated_length": 141.4729766845703,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.2273910631215057,
      "grad_norm": 0.02011125721037388,
      "kl": 0.01688098907470703,
      "learning_rate": 3.02185065570476e-06,
      "loss": 0.0042,
      "num_tokens": 237510790.0,
      "reward": 0.3963378071784973,
      "reward_std": 0.05887097865343094,
      "rewards/gemini_judge_reward_func/mean": 0.1183035746216774,
      "rewards/gemini_judge_reward_func/std": 0.23666170239448547,
      "rewards/semantic_correctness_reward_func/mean": 0.42233186960220337,
      "rewards/semantic_correctness_reward_func/std": 0.19566462934017181,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 666
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 772.0,
      "completions/mean_length": 165.80804443359375,
      "completions/mean_terminated_length": 154.1583709716797,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.22773249114421065,
      "grad_norm": 0.020004237070679665,
      "kl": 0.015186309814453125,
      "learning_rate": 3.0066758668832752e-06,
      "loss": -0.023,
      "num_tokens": 237851751.0,
      "reward": 0.4449497163295746,
      "reward_std": 0.060879725962877274,
      "rewards/gemini_judge_reward_func/mean": 0.1037946417927742,
      "rewards/gemini_judge_reward_func/std": 0.22978128492832184,
      "rewards/semantic_correctness_reward_func/mean": 0.39502519369125366,
      "rewards/semantic_correctness_reward_func/std": 0.21134421229362488,
      "rewards/xmlcount_reward_func/mean": 0.8110670447349548,
      "rewards/xmlcount_reward_func/std": 0.3861640691757202,
      "step": 667
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 334.0,
      "completions/mean_length": 132.1428680419922,
      "completions/mean_terminated_length": 128.1435089111328,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.22807391916691563,
      "grad_norm": 0.02160588651895523,
      "kl": 0.018302440643310547,
      "learning_rate": 2.991522876735154e-06,
      "loss": 0.0054,
      "num_tokens": 238195039.0,
      "reward": 0.4097975790500641,
      "reward_std": 0.07103915512561798,
      "rewards/gemini_judge_reward_func/mean": 0.1517857164144516,
      "rewards/gemini_judge_reward_func/std": 0.24227328598499298,
      "rewards/semantic_correctness_reward_func/mean": 0.4584164023399353,
      "rewards/semantic_correctness_reward_func/std": 0.2187541127204895,
      "rewards/xmlcount_reward_func/mean": 0.643500030040741,
      "rewards/xmlcount_reward_func/std": 0.48071083426475525,
      "step": 668
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 723.0,
      "completions/mean_length": 168.3616180419922,
      "completions/mean_terminated_length": 156.74661254882812,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.2284153471896206,
      "grad_norm": 0.021200206130743027,
      "kl": 0.015937328338623047,
      "learning_rate": 2.9763918509710647e-06,
      "loss": -0.0123,
      "num_tokens": 238590452.0,
      "reward": 0.429336816072464,
      "reward_std": 0.07005873322486877,
      "rewards/gemini_judge_reward_func/mean": 0.1450892835855484,
      "rewards/gemini_judge_reward_func/std": 0.2733252942562103,
      "rewards/semantic_correctness_reward_func/mean": 0.4533087909221649,
      "rewards/semantic_correctness_reward_func/std": 0.230568528175354,
      "rewards/xmlcount_reward_func/mean": 0.7015982270240784,
      "rewards/xmlcount_reward_func/std": 0.4568972587585449,
      "step": 669
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 425.0,
      "completions/mean_length": 165.20982360839844,
      "completions/mean_terminated_length": 141.57339477539062,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.22875677521232554,
      "grad_norm": 0.020127739757299423,
      "kl": 0.014457941055297852,
      "learning_rate": 2.9612829550614836e-06,
      "loss": -0.0052,
      "num_tokens": 238950803.0,
      "reward": 0.4565327763557434,
      "reward_std": 0.07144228368997574,
      "rewards/gemini_judge_reward_func/mean": 0.1897321492433548,
      "rewards/gemini_judge_reward_func/std": 0.30179867148399353,
      "rewards/semantic_correctness_reward_func/mean": 0.4736635982990265,
      "rewards/semantic_correctness_reward_func/std": 0.24912160634994507,
      "rewards/xmlcount_reward_func/mean": 0.7147678732872009,
      "rewards/xmlcount_reward_func/std": 0.45308414101600647,
      "step": 670
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 992.0,
      "completions/mean_length": 164.5,
      "completions/mean_terminated_length": 156.7567596435547,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.22909820323503052,
      "grad_norm": 0.020575378090143204,
      "kl": 0.015639543533325195,
      "learning_rate": 2.9461963542348737e-06,
      "loss": -0.0094,
      "num_tokens": 239289959.0,
      "reward": 0.4518532156944275,
      "reward_std": 0.07574018836021423,
      "rewards/gemini_judge_reward_func/mean": 0.1417410671710968,
      "rewards/gemini_judge_reward_func/std": 0.24999749660491943,
      "rewards/semantic_correctness_reward_func/mean": 0.4519446790218353,
      "rewards/semantic_correctness_reward_func/std": 0.20432503521442413,
      "rewards/xmlcount_reward_func/mean": 0.7619196772575378,
      "rewards/xmlcount_reward_func/std": 0.4264376759529114,
      "step": 671
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 304.0,
      "completions/mean_length": 157.3794708251953,
      "completions/mean_terminated_length": 149.57208251953125,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.22943963125773548,
      "grad_norm": 0.024331238120794296,
      "kl": 0.014272212982177734,
      "learning_rate": 2.931132213475884e-06,
      "loss": -0.015,
      "num_tokens": 239647616.0,
      "reward": 0.45021215081214905,
      "reward_std": 0.0597931370139122,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.225807324051857,
      "rewards/semantic_correctness_reward_func/mean": 0.43698903918266296,
      "rewards/semantic_correctness_reward_func/std": 0.15982688963413239,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 672
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 161.4241180419922,
      "completions/mean_terminated_length": 145.74090576171875,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.22978105928044043,
      "grad_norm": 0.020820723846554756,
      "kl": 0.016948699951171875,
      "learning_rate": 2.9160906975235493e-06,
      "loss": -0.0398,
      "num_tokens": 240000263.0,
      "reward": 0.40477100014686584,
      "reward_std": 0.06616359949111938,
      "rewards/gemini_judge_reward_func/mean": 0.1428571492433548,
      "rewards/gemini_judge_reward_func/std": 0.25873589515686035,
      "rewards/semantic_correctness_reward_func/mean": 0.4332745373249054,
      "rewards/semantic_correctness_reward_func/std": 0.19778446853160858,
      "rewards/xmlcount_reward_func/mean": 0.6524330973625183,
      "rewards/xmlcount_reward_func/std": 0.4732263386249542,
      "step": 673
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 175.1294708251953,
      "completions/mean_terminated_length": 147.7465362548828,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.2301224873031454,
      "grad_norm": 0.02019997127354145,
      "kl": 0.014962196350097656,
      "learning_rate": 2.9010719708694724e-06,
      "loss": -0.0229,
      "num_tokens": 240372648.0,
      "reward": 0.4191317856311798,
      "reward_std": 0.0710809975862503,
      "rewards/gemini_judge_reward_func/mean": 0.1361607164144516,
      "rewards/gemini_judge_reward_func/std": 0.26019221544265747,
      "rewards/semantic_correctness_reward_func/mean": 0.42908725142478943,
      "rewards/semantic_correctness_reward_func/std": 0.22505711019039154,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 674
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 852.0,
      "completions/mean_length": 153.46875,
      "completions/mean_terminated_length": 149.56503295898438,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.23046391532585037,
      "grad_norm": 0.019473010674118996,
      "kl": 0.01583242416381836,
      "learning_rate": 2.8860761977560435e-06,
      "loss": -0.0266,
      "num_tokens": 240725581.0,
      "reward": 0.4498962461948395,
      "reward_std": 0.07846318185329437,
      "rewards/gemini_judge_reward_func/mean": 0.1450892835855484,
      "rewards/gemini_judge_reward_func/std": 0.24749507009983063,
      "rewards/semantic_correctness_reward_func/mean": 0.4278828799724579,
      "rewards/semantic_correctness_reward_func/std": 0.20671235024929047,
      "rewards/xmlcount_reward_func/mean": 0.7657098770141602,
      "rewards/xmlcount_reward_func/std": 0.42325305938720703,
      "step": 675
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 887.0,
      "completions/mean_length": 168.2544708251953,
      "completions/mean_terminated_length": 152.69544982910156,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.23080534334855532,
      "grad_norm": 0.019685067236423492,
      "kl": 0.012958049774169922,
      "learning_rate": 2.871103542174637e-06,
      "loss": -0.0118,
      "num_tokens": 241073886.0,
      "reward": 0.45788535475730896,
      "reward_std": 0.06220375373959541,
      "rewards/gemini_judge_reward_func/mean": 0.1428571492433548,
      "rewards/gemini_judge_reward_func/std": 0.255465567111969,
      "rewards/semantic_correctness_reward_func/mean": 0.43071243166923523,
      "rewards/semantic_correctness_reward_func/std": 0.18672212958335876,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 676
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 404.0,
      "completions/mean_length": 158.4419708251953,
      "completions/mean_terminated_length": 146.6923065185547,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.2311467713712603,
      "grad_norm": 0.019380411133170128,
      "kl": 0.015169143676757812,
      "learning_rate": 2.8561541678638145e-06,
      "loss": -0.0014,
      "num_tokens": 241437493.0,
      "reward": 0.4243607223033905,
      "reward_std": 0.06035372614860535,
      "rewards/gemini_judge_reward_func/mean": 0.1383928507566452,
      "rewards/gemini_judge_reward_func/std": 0.23158472776412964,
      "rewards/semantic_correctness_reward_func/mean": 0.4288035035133362,
      "rewards/semantic_correctness_reward_func/std": 0.19333001971244812,
      "rewards/xmlcount_reward_func/mean": 0.7081071734428406,
      "rewards/xmlcount_reward_func/std": 0.45519739389419556,
      "step": 677
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 891.0,
      "completions/mean_length": 160.52232360839844,
      "completions/mean_terminated_length": 144.82272338867188,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.23148819939396525,
      "grad_norm": 0.02005195803940296,
      "kl": 0.015508174896240234,
      "learning_rate": 2.8412282383075362e-06,
      "loss": 0.0163,
      "num_tokens": 241799122.0,
      "reward": 0.4341495931148529,
      "reward_std": 0.06551965326070786,
      "rewards/gemini_judge_reward_func/mean": 0.1417410671710968,
      "rewards/gemini_judge_reward_func/std": 0.24317796528339386,
      "rewards/semantic_correctness_reward_func/mean": 0.4572656452655792,
      "rewards/semantic_correctness_reward_func/std": 0.20034907758235931,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 678
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 916.0,
      "completions/mean_length": 164.40625,
      "completions/mean_terminated_length": 148.7772674560547,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.23182962741667024,
      "grad_norm": 0.02083410508930683,
      "kl": 0.01825714111328125,
      "learning_rate": 2.826325916733378e-06,
      "loss": -0.0167,
      "num_tokens": 242142693.0,
      "reward": 0.4272557497024536,
      "reward_std": 0.06736288964748383,
      "rewards/gemini_judge_reward_func/mean": 0.1082589253783226,
      "rewards/gemini_judge_reward_func/std": 0.21504218876361847,
      "rewards/semantic_correctness_reward_func/mean": 0.42719826102256775,
      "rewards/semantic_correctness_reward_func/std": 0.22015966475009918,
      "rewards/xmlcount_reward_func/mean": 0.7462812662124634,
      "rewards/xmlcount_reward_func/std": 0.4369716942310333,
      "step": 679
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 355.0,
      "completions/mean_length": 159.64732360839844,
      "completions/mean_terminated_length": 147.91403198242188,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.2321710554393752,
      "grad_norm": 0.019481897354125977,
      "kl": 0.013461828231811523,
      "learning_rate": 2.811447366110741e-06,
      "loss": -0.0329,
      "num_tokens": 242488046.0,
      "reward": 0.44512689113616943,
      "reward_std": 0.06134679913520813,
      "rewards/gemini_judge_reward_func/mean": 0.1171875,
      "rewards/gemini_judge_reward_func/std": 0.22695417702198029,
      "rewards/semantic_correctness_reward_func/mean": 0.4540092945098877,
      "rewards/semantic_correctness_reward_func/std": 0.19972579181194305,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 680
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 345.0,
      "completions/mean_length": 163.79464721679688,
      "completions/mean_terminated_length": 144.15524291992188,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.23251248346208014,
      "grad_norm": 0.020205752924084663,
      "kl": 0.016667842864990234,
      "learning_rate": 2.796592749149071e-06,
      "loss": -0.0165,
      "num_tokens": 242860544.0,
      "reward": 0.4280025064945221,
      "reward_std": 0.07606809586286545,
      "rewards/gemini_judge_reward_func/mean": 0.1316964328289032,
      "rewards/gemini_judge_reward_func/std": 0.2656741440296173,
      "rewards/semantic_correctness_reward_func/mean": 0.4257535934448242,
      "rewards/semantic_correctness_reward_func/std": 0.19858862459659576,
      "rewards/xmlcount_reward_func/mean": 0.7254330515861511,
      "rewards/xmlcount_reward_func/std": 0.446159690618515,
      "step": 681
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 319.0,
      "completions/mean_length": 163.28125,
      "completions/mean_terminated_length": 143.630126953125,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.23285391148478513,
      "grad_norm": 0.021477343514561653,
      "kl": 0.024187564849853516,
      "learning_rate": 2.7817622282960816e-06,
      "loss": -0.0087,
      "num_tokens": 243204363.0,
      "reward": 0.4583207070827484,
      "reward_std": 0.07647057622671127,
      "rewards/gemini_judge_reward_func/mean": 0.1662946492433548,
      "rewards/gemini_judge_reward_func/std": 0.2740088105201721,
      "rewards/semantic_correctness_reward_func/mean": 0.44556760787963867,
      "rewards/semantic_correctness_reward_func/std": 0.21110759675502777,
      "rewards/xmlcount_reward_func/mean": 0.7567232847213745,
      "rewards/xmlcount_reward_func/std": 0.4276689887046814,
      "step": 682
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 167.50894165039062,
      "completions/mean_terminated_length": 143.93577575683594,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.23319533950749008,
      "grad_norm": 0.02061975747346878,
      "kl": 0.019166946411132812,
      "learning_rate": 2.766955965735968e-06,
      "loss": -0.047,
      "num_tokens": 243585773.0,
      "reward": 0.4028172791004181,
      "reward_std": 0.06361385434865952,
      "rewards/gemini_judge_reward_func/mean": 0.1450892835855484,
      "rewards/gemini_judge_reward_func/std": 0.27023160457611084,
      "rewards/semantic_correctness_reward_func/mean": 0.4413006007671356,
      "rewards/semantic_correctness_reward_func/std": 0.1913142055273056,
      "rewards/xmlcount_reward_func/mean": 0.6413035988807678,
      "rewards/xmlcount_reward_func/std": 0.48018917441368103,
      "step": 683
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 643.0,
      "completions/mean_length": 153.92857360839844,
      "completions/mean_terminated_length": 150.02691650390625,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.23353676753019503,
      "grad_norm": 0.020191872492432594,
      "kl": 0.014515876770019531,
      "learning_rate": 2.7521741233876496e-06,
      "loss": 0.0003,
      "num_tokens": 243900057.0,
      "reward": 0.46831250190734863,
      "reward_std": 0.0564018189907074,
      "rewards/gemini_judge_reward_func/mean": 0.1796875,
      "rewards/gemini_judge_reward_func/std": 0.3019375205039978,
      "rewards/semantic_correctness_reward_func/mean": 0.4449373781681061,
      "rewards/semantic_correctness_reward_func/std": 0.2366136610507965,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 684
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 883.0,
      "completions/mean_length": 148.47769165039062,
      "completions/mean_terminated_length": 144.55157470703125,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.23387819555290001,
      "grad_norm": 0.02208324708044529,
      "kl": 0.01886892318725586,
      "learning_rate": 2.7374168629029814e-06,
      "loss": -0.0058,
      "num_tokens": 244234772.0,
      "reward": 0.4652222692966461,
      "reward_std": 0.06461701542139053,
      "rewards/gemini_judge_reward_func/mean": 0.1417410671710968,
      "rewards/gemini_judge_reward_func/std": 0.2745744585990906,
      "rewards/semantic_correctness_reward_func/mean": 0.4338791072368622,
      "rewards/semantic_correctness_reward_func/std": 0.23112650215625763,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1009.0,
      "completions/mean_length": 177.38394165039062,
      "completions/mean_terminated_length": 154.0825653076172,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.23421962357560497,
      "grad_norm": 0.020363980904221535,
      "kl": 0.015145301818847656,
      "learning_rate": 2.722684345665004e-06,
      "loss": -0.0154,
      "num_tokens": 244596058.0,
      "reward": 0.44785916805267334,
      "reward_std": 0.0589757114648819,
      "rewards/gemini_judge_reward_func/mean": 0.1383928507566452,
      "rewards/gemini_judge_reward_func/std": 0.24794963002204895,
      "rewards/semantic_correctness_reward_func/mean": 0.42526012659072876,
      "rewards/semantic_correctness_reward_func/std": 0.21264635026454926,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 686
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 516.0,
      "completions/mean_length": 160.88394165039062,
      "completions/mean_terminated_length": 145.19090270996094,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.23456105159830992,
      "grad_norm": 0.02010117843747139,
      "kl": 0.013428688049316406,
      "learning_rate": 2.707976732786166e-06,
      "loss": -0.006,
      "num_tokens": 244951076.0,
      "reward": 0.46008288860321045,
      "reward_std": 0.06361074000597,
      "rewards/gemini_judge_reward_func/mean": 0.109375,
      "rewards/gemini_judge_reward_func/std": 0.21381975710391998,
      "rewards/semantic_correctness_reward_func/mean": 0.43716415762901306,
      "rewards/semantic_correctness_reward_func/std": 0.2125110626220703,
      "rewards/xmlcount_reward_func/mean": 0.8222500681877136,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 687
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 169.5044708251953,
      "completions/mean_terminated_length": 153.96817016601562,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.2349024796210149,
      "grad_norm": 0.01886354200541973,
      "kl": 0.015913724899291992,
      "learning_rate": 2.693294185106562e-06,
      "loss": 0.0221,
      "num_tokens": 245285709.0,
      "reward": 0.45489609241485596,
      "reward_std": 0.06527674198150635,
      "rewards/gemini_judge_reward_func/mean": 0.1127232164144516,
      "rewards/gemini_judge_reward_func/std": 0.22675560414791107,
      "rewards/semantic_correctness_reward_func/mean": 0.4760338366031647,
      "rewards/semantic_correctness_reward_func/std": 0.19940443336963654,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 688
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 360.0,
      "completions/mean_length": 162.21875,
      "completions/mean_terminated_length": 138.5,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.23524390764371986,
      "grad_norm": 0.02161382883787155,
      "kl": 0.01592111587524414,
      "learning_rate": 2.678636863192184e-06,
      "loss": 0.0057,
      "num_tokens": 245645274.0,
      "reward": 0.44552478194236755,
      "reward_std": 0.06137440726161003,
      "rewards/gemini_judge_reward_func/mean": 0.0959821417927742,
      "rewards/gemini_judge_reward_func/std": 0.19305641949176788,
      "rewards/semantic_correctness_reward_func/mean": 0.4626862704753876,
      "rewards/semantic_correctness_reward_func/std": 0.20750680565834045,
      "rewards/xmlcount_reward_func/mean": 0.7864866256713867,
      "rewards/xmlcount_reward_func/std": 0.4116491377353668,
      "step": 689
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 556.0,
      "completions/mean_length": 149.97769165039062,
      "completions/mean_terminated_length": 138.11312866210938,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.2355853356664248,
      "grad_norm": 0.022082503885030746,
      "kl": 0.016037464141845703,
      "learning_rate": 2.6640049273331516e-06,
      "loss": 0.024,
      "num_tokens": 246015097.0,
      "reward": 0.42489418387413025,
      "reward_std": 0.07650549709796906,
      "rewards/gemini_judge_reward_func/mean": 0.1741071492433548,
      "rewards/gemini_judge_reward_func/std": 0.2944541275501251,
      "rewards/semantic_correctness_reward_func/mean": 0.4535064101219177,
      "rewards/semantic_correctness_reward_func/std": 0.22471977770328522,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 690
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 936.0,
      "completions/mean_length": 152.14732360839844,
      "completions/mean_terminated_length": 148.2376708984375,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.2359267636891298,
      "grad_norm": 0.020293327048420906,
      "kl": 0.014290809631347656,
      "learning_rate": 2.649398537541978e-06,
      "loss": -0.0307,
      "num_tokens": 246355270.0,
      "reward": 0.44305023550987244,
      "reward_std": 0.06119895726442337,
      "rewards/gemini_judge_reward_func/mean": 0.1607142835855484,
      "rewards/gemini_judge_reward_func/std": 0.2944881021976471,
      "rewards/semantic_correctness_reward_func/mean": 0.48290279507637024,
      "rewards/semantic_correctness_reward_func/std": 0.22012221813201904,
      "rewards/xmlcount_reward_func/mean": 0.7054598927497864,
      "rewards/xmlcount_reward_func/std": 0.455239862203598,
      "step": 691
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 428.0,
      "completions/mean_length": 168.16519165039062,
      "completions/mean_terminated_length": 148.6255645751953,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.23626819171183475,
      "grad_norm": 0.021590879186987877,
      "kl": 0.015369415283203125,
      "learning_rate": 2.6348178535517967e-06,
      "loss": 0.0018,
      "num_tokens": 246698131.0,
      "reward": 0.4446691870689392,
      "reward_std": 0.07133954018354416,
      "rewards/gemini_judge_reward_func/mean": 0.1339285671710968,
      "rewards/gemini_judge_reward_func/std": 0.249278262257576,
      "rewards/semantic_correctness_reward_func/mean": 0.41823869943618774,
      "rewards/semantic_correctness_reward_func/std": 0.191751629114151,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 692
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 878.0,
      "completions/mean_length": 169.8482208251953,
      "completions/mean_terminated_length": 154.31817626953125,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.2366096197345397,
      "grad_norm": 0.020112143829464912,
      "kl": 0.013370513916015625,
      "learning_rate": 2.6202630348146323e-06,
      "loss": 0.0207,
      "num_tokens": 247018465.0,
      "reward": 0.4661848545074463,
      "reward_std": 0.06909771263599396,
      "rewards/gemini_judge_reward_func/mean": 0.1316964328289032,
      "rewards/gemini_judge_reward_func/std": 0.21297545731067657,
      "rewards/semantic_correctness_reward_func/mean": 0.45878109335899353,
      "rewards/semantic_correctness_reward_func/std": 0.19541005790233612,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 693
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 514.0,
      "completions/mean_length": 169.07144165039062,
      "completions/mean_terminated_length": 141.4930877685547,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.23695104775724468,
      "grad_norm": 0.01998029090464115,
      "kl": 0.016425132751464844,
      "learning_rate": 2.605734240499652e-06,
      "loss": -0.0034,
      "num_tokens": 247406061.0,
      "reward": 0.38997000455856323,
      "reward_std": 0.06167588382959366,
      "rewards/gemini_judge_reward_func/mean": 0.1104910746216774,
      "rewards/gemini_judge_reward_func/std": 0.22035281360149384,
      "rewards/semantic_correctness_reward_func/mean": 0.4418678879737854,
      "rewards/semantic_correctness_reward_func/std": 0.2028319090604782,
      "rewards/xmlcount_reward_func/mean": 0.643500030040741,
      "rewards/xmlcount_reward_func/std": 0.48071083426475525,
      "step": 694
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 351.0,
      "completions/mean_length": 157.0491180419922,
      "completions/mean_terminated_length": 141.28636169433594,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.23729247577994964,
      "grad_norm": 0.020776091143488884,
      "kl": 0.015616416931152344,
      "learning_rate": 2.5912316294914232e-06,
      "loss": -0.0041,
      "num_tokens": 247761004.0,
      "reward": 0.440807580947876,
      "reward_std": 0.06188865751028061,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.2582615911960602,
      "rewards/semantic_correctness_reward_func/mean": 0.41232332587242126,
      "rewards/semantic_correctness_reward_func/std": 0.23134461045265198,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 695
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 349.0,
      "completions/mean_length": 153.66964721679688,
      "completions/mean_terminated_length": 145.82882690429688,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.2376339038026546,
      "grad_norm": 0.021136008203029633,
      "kl": 0.01209259033203125,
      "learning_rate": 2.576755360388177e-06,
      "loss": -0.0243,
      "num_tokens": 248112446.0,
      "reward": 0.4230984151363373,
      "reward_std": 0.06140593811869621,
      "rewards/gemini_judge_reward_func/mean": 0.1004464253783226,
      "rewards/gemini_judge_reward_func/std": 0.1936776041984558,
      "rewards/semantic_correctness_reward_func/mean": 0.4130990207195282,
      "rewards/semantic_correctness_reward_func/std": 0.17426711320877075,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 696
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 825.0,
      "completions/mean_length": 159.65179443359375,
      "completions/mean_terminated_length": 143.9363555908203,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.23797533182535957,
      "grad_norm": 0.020741021260619164,
      "kl": 0.01749563217163086,
      "learning_rate": 2.562305591500069e-06,
      "loss": -0.0072,
      "num_tokens": 248460224.0,
      "reward": 0.43691644072532654,
      "reward_std": 0.06515128910541534,
      "rewards/gemini_judge_reward_func/mean": 0.140625,
      "rewards/gemini_judge_reward_func/std": 0.27565911412239075,
      "rewards/semantic_correctness_reward_func/mean": 0.43758201599121094,
      "rewards/semantic_correctness_reward_func/std": 0.23180371522903442,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 697
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 872.0,
      "completions/mean_length": 162.0669708251953,
      "completions/mean_terminated_length": 154.3018035888672,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.23831675984806452,
      "grad_norm": 0.021172866225242615,
      "kl": 0.01595783233642578,
      "learning_rate": 2.5478824808474613e-06,
      "loss": -0.0077,
      "num_tokens": 248800735.0,
      "reward": 0.48907050490379333,
      "reward_std": 0.07695534080266953,
      "rewards/gemini_judge_reward_func/mean": 0.1662946492433548,
      "rewards/gemini_judge_reward_func/std": 0.2830647826194763,
      "rewards/semantic_correctness_reward_func/mean": 0.5040131211280823,
      "rewards/semantic_correctness_reward_func/std": 0.20649151504039764,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 698
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 163.19644165039062,
      "completions/mean_terminated_length": 135.42857360839844,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.2386581878707695,
      "grad_norm": 0.020416466519236565,
      "kl": 0.016710758209228516,
      "learning_rate": 2.5334861861591753e-06,
      "loss": -0.0458,
      "num_tokens": 249173807.0,
      "reward": 0.38040465116500854,
      "reward_std": 0.0654895082116127,
      "rewards/gemini_judge_reward_func/mean": 0.1395089328289032,
      "rewards/gemini_judge_reward_func/std": 0.27367985248565674,
      "rewards/semantic_correctness_reward_func/mean": 0.4075053632259369,
      "rewards/semantic_correctness_reward_func/std": 0.22781768441200256,
      "rewards/xmlcount_reward_func/mean": 0.6077500581741333,
      "rewards/xmlcount_reward_func/std": 0.48996880650520325,
      "step": 699
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 561.0,
      "completions/mean_length": 157.02679443359375,
      "completions/mean_terminated_length": 153.13902282714844,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.23899961589347446,
      "grad_norm": 0.020300550386309624,
      "kl": 0.01459813117980957,
      "learning_rate": 2.5191168648707888e-06,
      "loss": -0.0093,
      "num_tokens": 249524129.0,
      "reward": 0.4497314989566803,
      "reward_std": 0.06387756019830704,
      "rewards/gemini_judge_reward_func/mean": 0.1261160671710968,
      "rewards/gemini_judge_reward_func/std": 0.22584888339042664,
      "rewards/semantic_correctness_reward_func/mean": 0.45917510986328125,
      "rewards/semantic_correctness_reward_func/std": 0.22504274547100067,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 700
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 900.0,
      "completions/max_terminated_length": 900.0,
      "completions/mean_length": 144.76339721679688,
      "completions/mean_terminated_length": 144.76339721679688,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.2393410439161794,
      "grad_norm": 0.020862894132733345,
      "kl": 0.01591944694519043,
      "learning_rate": 2.5047746741228977e-06,
      "loss": -0.0147,
      "num_tokens": 249899300.0,
      "reward": 0.4328038990497589,
      "reward_std": 0.0568770207464695,
      "rewards/gemini_judge_reward_func/mean": 0.1116071417927742,
      "rewards/gemini_judge_reward_func/std": 0.21397185325622559,
      "rewards/semantic_correctness_reward_func/mean": 0.43934059143066406,
      "rewards/semantic_correctness_reward_func/std": 0.2161037027835846,
      "rewards/xmlcount_reward_func/mean": 0.7507321238517761,
      "rewards/xmlcount_reward_func/std": 0.43440625071525574,
      "step": 701
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 934.0,
      "completions/mean_length": 156.40625,
      "completions/mean_terminated_length": 144.62896728515625,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.2396824719388844,
      "grad_norm": 0.020461153239011765,
      "kl": 0.020430326461791992,
      "learning_rate": 2.490459770759398e-06,
      "loss": -0.0092,
      "num_tokens": 250251623.0,
      "reward": 0.4267743229866028,
      "reward_std": 0.053675100207328796,
      "rewards/gemini_judge_reward_func/mean": 0.1506696492433548,
      "rewards/gemini_judge_reward_func/std": 0.2799535393714905,
      "rewards/semantic_correctness_reward_func/mean": 0.4382820129394531,
      "rewards/semantic_correctness_reward_func/std": 0.22525343298912048,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 702
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 434.0,
      "completions/mean_length": 155.1294708251953,
      "completions/mean_terminated_length": 143.33485412597656,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.24002389996158935,
      "grad_norm": 0.022830575704574585,
      "kl": 0.015569210052490234,
      "learning_rate": 2.476172311325783e-06,
      "loss": -0.0099,
      "num_tokens": 250598840.0,
      "reward": 0.46036437153816223,
      "reward_std": 0.0712478905916214,
      "rewards/gemini_judge_reward_func/mean": 0.1640625,
      "rewards/gemini_judge_reward_func/std": 0.28239211440086365,
      "rewards/semantic_correctness_reward_func/mean": 0.4364466667175293,
      "rewards/semantic_correctness_reward_func/std": 0.21228350698947906,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 703
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 757.0,
      "completions/mean_length": 157.1294708251953,
      "completions/mean_terminated_length": 149.31982421875,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.2403653279842943,
      "grad_norm": 0.022787317633628845,
      "kl": 0.017244815826416016,
      "learning_rate": 2.461912452067415e-06,
      "loss": -0.0304,
      "num_tokens": 250945797.0,
      "reward": 0.4652026295661926,
      "reward_std": 0.06580142676830292,
      "rewards/gemini_judge_reward_func/mean": 0.1450892835855484,
      "rewards/gemini_judge_reward_func/std": 0.27536845207214355,
      "rewards/semantic_correctness_reward_func/mean": 0.4717719852924347,
      "rewards/semantic_correctness_reward_func/std": 0.23088571429252625,
      "rewards/xmlcount_reward_func/mean": 0.782031238079071,
      "rewards/xmlcount_reward_func/std": 0.41473886370658875,
      "step": 704
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 304.0,
      "completions/mean_length": 149.07144165039062,
      "completions/mean_terminated_length": 133.16363525390625,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.24070675600699928,
      "grad_norm": 0.02126486226916313,
      "kl": 0.01680445671081543,
      "learning_rate": 2.447680348927837e-06,
      "loss": -0.0002,
      "num_tokens": 251297597.0,
      "reward": 0.3836461007595062,
      "reward_std": 0.047221578657627106,
      "rewards/gemini_judge_reward_func/mean": 0.1104910746216774,
      "rewards/gemini_judge_reward_func/std": 0.21389874815940857,
      "rewards/semantic_correctness_reward_func/mean": 0.374498188495636,
      "rewards/semantic_correctness_reward_func/std": 0.17782600224018097,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500184178352356,
      "step": 705
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 767.0,
      "completions/mean_length": 166.7678680419922,
      "completions/mean_terminated_length": 147.1963348388672,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.24104818402970424,
      "grad_norm": 0.019322630017995834,
      "kl": 0.015539407730102539,
      "learning_rate": 2.433476157547044e-06,
      "loss": -0.008,
      "num_tokens": 251664097.0,
      "reward": 0.42840591073036194,
      "reward_std": 0.07416719198226929,
      "rewards/gemini_judge_reward_func/mean": 0.1986607164144516,
      "rewards/gemini_judge_reward_func/std": 0.3125520348548889,
      "rewards/semantic_correctness_reward_func/mean": 0.4934581220149994,
      "rewards/semantic_correctness_reward_func/std": 0.23917879164218903,
      "rewards/xmlcount_reward_func/mean": 0.6256250739097595,
      "rewards/xmlcount_reward_func/std": 0.48569241166114807,
      "step": 706
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 405.0,
      "completions/mean_length": 154.90625,
      "completions/mean_terminated_length": 151.00897216796875,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.2413896120524092,
      "grad_norm": 0.0201317947357893,
      "kl": 0.01163625717163086,
      "learning_rate": 2.4193000332597984e-06,
      "loss": -0.0167,
      "num_tokens": 251992972.0,
      "reward": 0.4662090837955475,
      "reward_std": 0.06485062837600708,
      "rewards/gemini_judge_reward_func/mean": 0.1261160671710968,
      "rewards/gemini_judge_reward_func/std": 0.2437535524368286,
      "rewards/semantic_correctness_reward_func/mean": 0.434313029050827,
      "rewards/semantic_correctness_reward_func/std": 0.20111876726150513,
      "rewards/xmlcount_reward_func/mean": 0.8222500681877136,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 707
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 642.0,
      "completions/mean_length": 169.05357360839844,
      "completions/mean_terminated_length": 153.5090789794922,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.24173104007511417,
      "grad_norm": 0.0201317947357893,
      "kl": 0.015091657638549805,
      "learning_rate": 2.4193000332597984e-06,
      "loss": 0.0044,
      "num_tokens": 252365404.0,
      "reward": 0.4412159323692322,
      "reward_std": 0.06695268303155899,
      "rewards/gemini_judge_reward_func/mean": 0.1506696492433548,
      "rewards/gemini_judge_reward_func/std": 0.2718265652656555,
      "rewards/semantic_correctness_reward_func/mean": 0.47249937057495117,
      "rewards/semantic_correctness_reward_func/std": 0.22053788602352142,
      "rewards/xmlcount_reward_func/mean": 0.7161206007003784,
      "rewards/xmlcount_reward_func/std": 0.4517506957054138,
      "step": 708
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 268.0,
      "completions/mean_length": 147.90625,
      "completions/mean_terminated_length": 143.9775848388672,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.24207246809781913,
      "grad_norm": 0.019735103473067284,
      "kl": 0.017200469970703125,
      "learning_rate": 2.4051521310939258e-06,
      "loss": 0.0044,
      "num_tokens": 252725691.0,
      "reward": 0.4373900890350342,
      "reward_std": 0.07390647381544113,
      "rewards/gemini_judge_reward_func/mean": 0.1049107164144516,
      "rewards/gemini_judge_reward_func/std": 0.22370301187038422,
      "rewards/semantic_correctness_reward_func/mean": 0.404128760099411,
      "rewards/semantic_correctness_reward_func/std": 0.1993720382452011,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 709
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0401785714285714,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 535.0,
      "completions/mean_length": 180.4553680419922,
      "completions/mean_terminated_length": 145.14418029785156,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.24241389612052408,
      "grad_norm": 0.01987229473888874,
      "kl": 0.015630483627319336,
      "learning_rate": 2.391032605768613e-06,
      "loss": -0.0101,
      "num_tokens": 253089801.0,
      "reward": 0.4440220594406128,
      "reward_std": 0.07133130729198456,
      "rewards/gemini_judge_reward_func/mean": 0.1607142835855484,
      "rewards/gemini_judge_reward_func/std": 0.2778382897377014,
      "rewards/semantic_correctness_reward_func/mean": 0.47985124588012695,
      "rewards/semantic_correctness_reward_func/std": 0.21601252257823944,
      "rewards/xmlcount_reward_func/mean": 0.7094151377677917,
      "rewards/xmlcount_reward_func/std": 0.4536706507205963,
      "step": 710
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 558.0,
      "completions/mean_length": 169.70982360839844,
      "completions/mean_terminated_length": 142.15206909179688,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.24275532414322906,
      "grad_norm": 0.02286173216998577,
      "kl": 0.019670486450195312,
      "learning_rate": 2.3769416116927335e-06,
      "loss": -0.0615,
      "num_tokens": 253479884.0,
      "reward": 0.3929518163204193,
      "reward_std": 0.05669989064335823,
      "rewards/gemini_judge_reward_func/mean": 0.109375,
      "rewards/gemini_judge_reward_func/std": 0.24781839549541473,
      "rewards/semantic_correctness_reward_func/mean": 0.4232589900493622,
      "rewards/semantic_correctness_reward_func/std": 0.19399282336235046,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 711
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 405.0,
      "completions/mean_length": 159.66519165039062,
      "completions/mean_terminated_length": 147.93212890625,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.24309675216593402,
      "grad_norm": 0.019998321309685707,
      "kl": 0.01382303237915039,
      "learning_rate": 2.3628793029631353e-06,
      "loss": -0.0188,
      "num_tokens": 253810629.0,
      "reward": 0.4336186945438385,
      "reward_std": 0.05938807874917984,
      "rewards/gemini_judge_reward_func/mean": 0.1082589253783226,
      "rewards/gemini_judge_reward_func/std": 0.20976413786411285,
      "rewards/semantic_correctness_reward_func/mean": 0.40090590715408325,
      "rewards/semantic_correctness_reward_func/std": 0.19256816804409027,
      "rewards/xmlcount_reward_func/mean": 0.7753348350524902,
      "rewards/xmlcount_reward_func/std": 0.4151875972747803,
      "step": 712
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 597.0,
      "completions/max_terminated_length": 597.0,
      "completions/mean_length": 149.82589721679688,
      "completions/mean_terminated_length": 149.82589721679688,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.24343818018863897,
      "grad_norm": 0.02148658223450184,
      "kl": 0.014288663864135742,
      "learning_rate": 2.3488458333629777e-06,
      "loss": 0.013,
      "num_tokens": 254159642.0,
      "reward": 0.47455543279647827,
      "reward_std": 0.06833560764789581,
      "rewards/gemini_judge_reward_func/mean": 0.1171875,
      "rewards/gemini_judge_reward_func/std": 0.22695417702198029,
      "rewards/semantic_correctness_reward_func/mean": 0.42240217328071594,
      "rewards/semantic_correctness_reward_func/std": 0.2225671112537384,
      "rewards/xmlcount_reward_func/mean": 0.8580000996589661,
      "rewards/xmlcount_reward_func/std": 0.35106155276298523,
      "step": 713
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 148.87054443359375,
      "completions/mean_terminated_length": 140.9864959716797,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.24377960821134395,
      "grad_norm": 0.020947393029928207,
      "kl": 0.01793670654296875,
      "learning_rate": 2.3348413563600324e-06,
      "loss": 0.0081,
      "num_tokens": 254523265.0,
      "reward": 0.42428773641586304,
      "reward_std": 0.06304830312728882,
      "rewards/gemini_judge_reward_func/mean": 0.1473214328289032,
      "rewards/gemini_judge_reward_func/std": 0.2802416682243347,
      "rewards/semantic_correctness_reward_func/mean": 0.43254581093788147,
      "rewards/semantic_correctness_reward_func/std": 0.20892907679080963,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 903.0,
      "completions/mean_length": 169.25894165039062,
      "completions/mean_terminated_length": 141.68663024902344,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.2441210362340489,
      "grad_norm": 0.019883327186107635,
      "kl": 0.014074325561523438,
      "learning_rate": 2.320866025105016e-06,
      "loss": -0.0148,
      "num_tokens": 254898227.0,
      "reward": 0.39803096652030945,
      "reward_std": 0.06506957113742828,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.21568416059017181,
      "rewards/semantic_correctness_reward_func/mean": 0.4475386440753937,
      "rewards/semantic_correctness_reward_func/std": 0.20398251712322235,
      "rewards/xmlcount_reward_func/mean": 0.6440759301185608,
      "rewards/xmlcount_reward_func/std": 0.4800132215023041,
      "step": 715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 807.0,
      "completions/mean_length": 166.8794708251953,
      "completions/mean_terminated_length": 151.2954559326172,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.2444624642567539,
      "grad_norm": 0.019473331049084663,
      "kl": 0.014130592346191406,
      "learning_rate": 2.3069199924299175e-06,
      "loss": -0.0324,
      "num_tokens": 255253776.0,
      "reward": 0.45115411281585693,
      "reward_std": 0.07554265856742859,
      "rewards/gemini_judge_reward_func/mean": 0.1495535671710968,
      "rewards/gemini_judge_reward_func/std": 0.2656741738319397,
      "rewards/semantic_correctness_reward_func/mean": 0.4472883641719818,
      "rewards/semantic_correctness_reward_func/std": 0.21093684434890747,
      "rewards/xmlcount_reward_func/mean": 0.7546876072883606,
      "rewards/xmlcount_reward_func/std": 0.42363569140434265,
      "step": 716
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 361.0,
      "completions/mean_length": 160.46429443359375,
      "completions/mean_terminated_length": 148.74208068847656,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.24480389227945884,
      "grad_norm": 0.022255755960941315,
      "kl": 0.012172698974609375,
      "learning_rate": 2.29300341084631e-06,
      "loss": -0.0087,
      "num_tokens": 255613864.0,
      "reward": 0.48112553358078003,
      "reward_std": 0.06726916879415512,
      "rewards/gemini_judge_reward_func/mean": 0.1897321492433548,
      "rewards/gemini_judge_reward_func/std": 0.3276248574256897,
      "rewards/semantic_correctness_reward_func/mean": 0.48891332745552063,
      "rewards/semantic_correctness_reward_func/std": 0.24710066616535187,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 717
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 959.0,
      "completions/mean_length": 167.55804443359375,
      "completions/mean_terminated_length": 151.98635864257812,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.2451453203021638,
      "grad_norm": 0.01919080689549446,
      "kl": 0.016460418701171875,
      "learning_rate": 2.2791164325437047e-06,
      "loss": 0.014,
      "num_tokens": 255962745.0,
      "reward": 0.43085789680480957,
      "reward_std": 0.06295596063137054,
      "rewards/gemini_judge_reward_func/mean": 0.1651785671710968,
      "rewards/gemini_judge_reward_func/std": 0.2900702953338623,
      "rewards/semantic_correctness_reward_func/mean": 0.4296819865703583,
      "rewards/semantic_correctness_reward_func/std": 0.2027299851179123,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 718
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 672.0,
      "completions/mean_length": 161.13394165039062,
      "completions/mean_terminated_length": 145.44544982910156,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.24548674832486878,
      "grad_norm": 0.021129751577973366,
      "kl": 0.017368555068969727,
      "learning_rate": 2.265259209387867e-06,
      "loss": -0.0283,
      "num_tokens": 256321887.0,
      "reward": 0.4192156493663788,
      "reward_std": 0.07224642485380173,
      "rewards/gemini_judge_reward_func/mean": 0.1238839253783226,
      "rewards/gemini_judge_reward_func/std": 0.2117588222026825,
      "rewards/semantic_correctness_reward_func/mean": 0.4812476933002472,
      "rewards/semantic_correctness_reward_func/std": 0.20846113562583923,
      "rewards/xmlcount_reward_func/mean": 0.6835312843322754,
      "rewards/xmlcount_reward_func/std": 0.47700417041778564,
      "step": 719
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 927.0,
      "completions/mean_length": 163.7991180419922,
      "completions/mean_terminated_length": 159.94171142578125,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.24582817634757373,
      "grad_norm": 0.020381838083267212,
      "kl": 0.013567447662353516,
      "learning_rate": 2.2514318929191707e-06,
      "loss": -0.0257,
      "num_tokens": 256673270.0,
      "reward": 0.4235455095767975,
      "reward_std": 0.07512028515338898,
      "rewards/gemini_judge_reward_func/mean": 0.1138392835855484,
      "rewards/gemini_judge_reward_func/std": 0.21146614849567413,
      "rewards/semantic_correctness_reward_func/mean": 0.42429882287979126,
      "rewards/semantic_correctness_reward_func/std": 0.1959509402513504,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 720
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 464.0,
      "completions/mean_length": 154.12054443359375,
      "completions/mean_terminated_length": 146.28378295898438,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.24616960437027868,
      "grad_norm": 0.021473728120326996,
      "kl": 0.017746925354003906,
      "learning_rate": 2.2376346343509343e-06,
      "loss": 0.0081,
      "num_tokens": 257008805.0,
      "reward": 0.45415744185447693,
      "reward_std": 0.0672735646367073,
      "rewards/gemini_judge_reward_func/mean": 0.1372767835855484,
      "rewards/gemini_judge_reward_func/std": 0.22551624476909637,
      "rewards/semantic_correctness_reward_func/mean": 0.4321710169315338,
      "rewards/semantic_correctness_reward_func/std": 0.21166691184043884,
      "rewards/xmlcount_reward_func/mean": 0.782031238079071,
      "rewards/xmlcount_reward_func/std": 0.41473886370658875,
      "step": 721
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 409.0,
      "completions/mean_length": 172.3794708251953,
      "completions/mean_terminated_length": 144.9078369140625,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.24651103239298366,
      "grad_norm": 0.020387083292007446,
      "kl": 0.014590740203857422,
      "learning_rate": 2.2238675845677663e-06,
      "loss": -0.0239,
      "num_tokens": 257348598.0,
      "reward": 0.4348061680793762,
      "reward_std": 0.04860123619437218,
      "rewards/gemini_judge_reward_func/mean": 0.1328125,
      "rewards/gemini_judge_reward_func/std": 0.2424771934747696,
      "rewards/semantic_correctness_reward_func/mean": 0.44265562295913696,
      "rewards/semantic_correctness_reward_func/std": 0.21456462144851685,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 722
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 617.0,
      "completions/mean_length": 150.91964721679688,
      "completions/mean_terminated_length": 135.0454559326172,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.24685246041568862,
      "grad_norm": 0.024489011615514755,
      "kl": 0.017383098602294922,
      "learning_rate": 2.2101308941239204e-06,
      "loss": -0.0112,
      "num_tokens": 257698000.0,
      "reward": 0.4203735888004303,
      "reward_std": 0.07543018460273743,
      "rewards/gemini_judge_reward_func/mean": 0.1584821492433548,
      "rewards/gemini_judge_reward_func/std": 0.2928435206413269,
      "rewards/semantic_correctness_reward_func/mean": 0.4264035224914551,
      "rewards/semantic_correctness_reward_func/std": 0.22055880725383759,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 723
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 303.0,
      "completions/mean_length": 157.49554443359375,
      "completions/mean_terminated_length": 149.68919372558594,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.24719388843839357,
      "grad_norm": 0.022120485082268715,
      "kl": 0.013323783874511719,
      "learning_rate": 2.1964247132416373e-06,
      "loss": -0.0106,
      "num_tokens": 258037219.0,
      "reward": 0.4730894863605499,
      "reward_std": 0.060390159487724304,
      "rewards/gemini_judge_reward_func/mean": 0.1495535671710968,
      "rewards/gemini_judge_reward_func/std": 0.2635558545589447,
      "rewards/semantic_correctness_reward_func/mean": 0.4575989544391632,
      "rewards/semantic_correctness_reward_func/std": 0.22946356236934662,
      "rewards/xmlcount_reward_func/mean": 0.8043705821037292,
      "rewards/xmlcount_reward_func/std": 0.3985821604728699,
      "step": 724
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 579.0,
      "completions/mean_length": 158.94644165039062,
      "completions/mean_terminated_length": 143.21817016601562,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.24753531646109855,
      "grad_norm": 0.02180694416165352,
      "kl": 0.017840862274169922,
      "learning_rate": 2.182749191809518e-06,
      "loss": 0.0194,
      "num_tokens": 258411615.0,
      "reward": 0.43769633769989014,
      "reward_std": 0.07489373534917831,
      "rewards/gemini_judge_reward_func/mean": 0.1484375,
      "rewards/gemini_judge_reward_func/std": 0.28412362933158875,
      "rewards/semantic_correctness_reward_func/mean": 0.4616064429283142,
      "rewards/semantic_correctness_reward_func/std": 0.21261639893054962,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 285.0,
      "completions/mean_length": 161.68304443359375,
      "completions/mean_terminated_length": 141.99542236328125,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.2478767444838035,
      "grad_norm": 0.022032292559742928,
      "kl": 0.015535354614257812,
      "learning_rate": 2.1691044793808734e-06,
      "loss": -0.0172,
      "num_tokens": 258770296.0,
      "reward": 0.4378298223018646,
      "reward_std": 0.07478249073028564,
      "rewards/gemini_judge_reward_func/mean": 0.140625,
      "rewards/gemini_judge_reward_func/std": 0.26635146141052246,
      "rewards/semantic_correctness_reward_func/mean": 0.4556399881839752,
      "rewards/semantic_correctness_reward_func/std": 0.23665642738342285,
      "rewards/xmlcount_reward_func/mean": 0.7261295318603516,
      "rewards/xmlcount_reward_func/std": 0.44144105911254883,
      "step": 726
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 150.37054443359375,
      "completions/mean_terminated_length": 138.51132202148438,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.24821817250650846,
      "grad_norm": 0.022914161905646324,
      "kl": 0.018914222717285156,
      "learning_rate": 2.1554907251720947e-06,
      "loss": 0.0018,
      "num_tokens": 259124663.0,
      "reward": 0.4364605247974396,
      "reward_std": 0.06964661180973053,
      "rewards/gemini_judge_reward_func/mean": 0.1841517835855484,
      "rewards/gemini_judge_reward_func/std": 0.2906714081764221,
      "rewards/semantic_correctness_reward_func/mean": 0.45102569460868835,
      "rewards/semantic_correctness_reward_func/std": 0.2021547555923462,
      "rewards/xmlcount_reward_func/mean": 0.6814866662025452,
      "rewards/xmlcount_reward_func/std": 0.46647319197654724,
      "step": 727
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 749.0,
      "completions/mean_length": 177.90626525878906,
      "completions/mean_terminated_length": 150.61289978027344,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.24855960052921344,
      "grad_norm": 0.019710304215550423,
      "kl": 0.016555309295654297,
      "learning_rate": 2.1419080780610123e-06,
      "loss": -0.0271,
      "num_tokens": 259495574.0,
      "reward": 0.42523664236068726,
      "reward_std": 0.06002560257911682,
      "rewards/gemini_judge_reward_func/mean": 0.1138392835855484,
      "rewards/gemini_judge_reward_func/std": 0.23530438542366028,
      "rewards/semantic_correctness_reward_func/mean": 0.4327545166015625,
      "rewards/semantic_correctness_reward_func/std": 0.2105783075094223,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 728
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 884.0,
      "completions/mean_length": 168.3482208251953,
      "completions/mean_terminated_length": 148.81277465820312,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.2489010285519184,
      "grad_norm": 0.020126277580857277,
      "kl": 0.015465736389160156,
      "learning_rate": 2.1283566865852824e-06,
      "loss": 0.0194,
      "num_tokens": 259854848.0,
      "reward": 0.4410649836063385,
      "reward_std": 0.08376999199390411,
      "rewards/gemini_judge_reward_func/mean": 0.1685267835855484,
      "rewards/gemini_judge_reward_func/std": 0.2991064190864563,
      "rewards/semantic_correctness_reward_func/mean": 0.4465034008026123,
      "rewards/semantic_correctness_reward_func/std": 0.20890836417675018,
      "rewards/xmlcount_reward_func/mean": 0.7108839750289917,
      "rewards/xmlcount_reward_func/std": 0.45479342341423035,
      "step": 729
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 709.0,
      "completions/mean_length": 154.7053680419922,
      "completions/mean_terminated_length": 146.87387084960938,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.24924245657462335,
      "grad_norm": 0.02328014187514782,
      "kl": 0.01580524444580078,
      "learning_rate": 2.11483669894075e-06,
      "loss": 0.0061,
      "num_tokens": 260216594.0,
      "reward": 0.42892396450042725,
      "reward_std": 0.06756206601858139,
      "rewards/gemini_judge_reward_func/mean": 0.1584821492433548,
      "rewards/gemini_judge_reward_func/std": 0.28801846504211426,
      "rewards/semantic_correctness_reward_func/mean": 0.4334053695201874,
      "rewards/semantic_correctness_reward_func/std": 0.23770850896835327,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 730
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 907.0,
      "completions/mean_length": 156.73214721679688,
      "completions/mean_terminated_length": 144.95928955078125,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.24958388459732833,
      "grad_norm": 0.020609304308891296,
      "kl": 0.01523447036743164,
      "learning_rate": 2.1013482629798334e-06,
      "loss": -0.0081,
      "num_tokens": 260574246.0,
      "reward": 0.3963780105113983,
      "reward_std": 0.05536142736673355,
      "rewards/gemini_judge_reward_func/mean": 0.1060267835855484,
      "rewards/gemini_judge_reward_func/std": 0.23717662692070007,
      "rewards/semantic_correctness_reward_func/mean": 0.4024167060852051,
      "rewards/semantic_correctness_reward_func/std": 0.22327245771884918,
      "rewards/xmlcount_reward_func/mean": 0.6837098002433777,
      "rewards/xmlcount_reward_func/std": 0.46379369497299194,
      "step": 731
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 686.0,
      "completions/mean_length": 161.20982360839844,
      "completions/mean_terminated_length": 153.4369354248047,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.24992531262003329,
      "grad_norm": 0.01949228160083294,
      "kl": 0.014344215393066406,
      "learning_rate": 2.08789152620991e-06,
      "loss": -0.0066,
      "num_tokens": 260925937.0,
      "reward": 0.4333937168121338,
      "reward_std": 0.06455694884061813,
      "rewards/gemini_judge_reward_func/mean": 0.1138392835855484,
      "rewards/gemini_judge_reward_func/std": 0.22927159070968628,
      "rewards/semantic_correctness_reward_func/mean": 0.4036467969417572,
      "rewards/semantic_correctness_reward_func/std": 0.23052960634231567,
      "rewards/xmlcount_reward_func/mean": 0.7678214311599731,
      "rewards/xmlcount_reward_func/std": 0.4232962429523468,
      "step": 732
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 749.0,
      "completions/mean_length": 169.80804443359375,
      "completions/mean_terminated_length": 146.29815673828125,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.25026674064273824,
      "grad_norm": 0.020751619711518288,
      "kl": 0.01793694496154785,
      "learning_rate": 2.0744666357916925e-06,
      "loss": -0.0265,
      "num_tokens": 261278230.0,
      "reward": 0.4312310814857483,
      "reward_std": 0.059306785464286804,
      "rewards/gemini_judge_reward_func/mean": 0.1082589253783226,
      "rewards/gemini_judge_reward_func/std": 0.20976413786411285,
      "rewards/semantic_correctness_reward_func/mean": 0.45834270119667053,
      "rewards/semantic_correctness_reward_func/std": 0.1977355182170868,
      "rewards/xmlcount_reward_func/mean": 0.7406473159790039,
      "rewards/xmlcount_reward_func/std": 0.4352959096431732,
      "step": 733
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 152.18304443359375,
      "completions/mean_terminated_length": 144.32882690429688,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.2506081686654432,
      "grad_norm": 0.020522359758615494,
      "kl": 0.018199682235717773,
      "learning_rate": 2.061073738537635e-06,
      "loss": -0.0267,
      "num_tokens": 261609671.0,
      "reward": 0.4286612272262573,
      "reward_std": 0.07006536424160004,
      "rewards/gemini_judge_reward_func/mean": 0.1138392835855484,
      "rewards/gemini_judge_reward_func/std": 0.238850936293602,
      "rewards/semantic_correctness_reward_func/mean": 0.4498775899410248,
      "rewards/semantic_correctness_reward_func/std": 0.1819140762090683,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 734
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 647.0,
      "completions/mean_length": 163.95982360839844,
      "completions/mean_terminated_length": 144.32418823242188,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.2509495966881482,
      "grad_norm": 0.021382635459303856,
      "kl": 0.01749277114868164,
      "learning_rate": 2.0477129809103147e-06,
      "loss": -0.0024,
      "num_tokens": 261982182.0,
      "reward": 0.45341190695762634,
      "reward_std": 0.07530924677848816,
      "rewards/gemini_judge_reward_func/mean": 0.15625,
      "rewards/gemini_judge_reward_func/std": 0.30342772603034973,
      "rewards/semantic_correctness_reward_func/mean": 0.4799344837665558,
      "rewards/semantic_correctness_reward_func/std": 0.231959268450737,
      "rewards/xmlcount_reward_func/mean": 0.7373126149177551,
      "rewards/xmlcount_reward_func/std": 0.4393693506717682,
      "step": 735
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 523.0,
      "completions/mean_length": 154.74107360839844,
      "completions/mean_terminated_length": 142.94117736816406,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.25129102471085313,
      "grad_norm": 0.0198200773447752,
      "kl": 0.012925148010253906,
      "learning_rate": 2.034384509020837e-06,
      "loss": -0.0073,
      "num_tokens": 262327636.0,
      "reward": 0.43286120891571045,
      "reward_std": 0.05032704025506973,
      "rewards/gemini_judge_reward_func/mean": 0.1049107164144516,
      "rewards/gemini_judge_reward_func/std": 0.19847624003887177,
      "rewards/semantic_correctness_reward_func/mean": 0.4172344207763672,
      "rewards/semantic_correctness_reward_func/std": 0.1993558406829834,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 736
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 362.0,
      "completions/mean_length": 156.1607208251953,
      "completions/mean_terminated_length": 144.38009643554688,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.2516324527335581,
      "grad_norm": 0.021754244342446327,
      "kl": 0.020738601684570312,
      "learning_rate": 2.021088468627237e-06,
      "loss": -0.0388,
      "num_tokens": 262679504.0,
      "reward": 0.4577825963497162,
      "reward_std": 0.04928554967045784,
      "rewards/gemini_judge_reward_func/mean": 0.0982142835855484,
      "rewards/gemini_judge_reward_func/std": 0.21270504593849182,
      "rewards/semantic_correctness_reward_func/mean": 0.41223424673080444,
      "rewards/semantic_correctness_reward_func/std": 0.1733391135931015,
      "rewards/xmlcount_reward_func/mean": 0.8401250839233398,
      "rewards/xmlcount_reward_func/std": 0.3684578835964203,
      "step": 737
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 153.9241180419922,
      "completions/mean_terminated_length": 138.1045379638672,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.2519738807562631,
      "grad_norm": 0.020852848887443542,
      "kl": 0.01461172103881836,
      "learning_rate": 2.0078250051328783e-06,
      "loss": -0.02,
      "num_tokens": 263058583.0,
      "reward": 0.39713457226753235,
      "reward_std": 0.06134895235300064,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.26361992955207825,
      "rewards/semantic_correctness_reward_func/mean": 0.4419762194156647,
      "rewards/semantic_correctness_reward_func/std": 0.21511034667491913,
      "rewards/xmlcount_reward_func/mean": 0.643500030040741,
      "rewards/xmlcount_reward_func/std": 0.48071080446243286,
      "step": 738
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 531.0,
      "completions/mean_length": 155.4419708251953,
      "completions/mean_terminated_length": 135.6118621826172,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.252315308778968,
      "grad_norm": 0.02940811775624752,
      "kl": 0.023842573165893555,
      "learning_rate": 1.9945942635848745e-06,
      "loss": -0.041,
      "num_tokens": 263423142.0,
      "reward": 0.4132702052593231,
      "reward_std": 0.07020176947116852,
      "rewards/gemini_judge_reward_func/mean": 0.125,
      "rewards/gemini_judge_reward_func/std": 0.2494388222694397,
      "rewards/semantic_correctness_reward_func/mean": 0.45785093307495117,
      "rewards/semantic_correctness_reward_func/std": 0.20504869520664215,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 739
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 160.79019165039062,
      "completions/mean_terminated_length": 149.07240295410156,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.252656736801673,
      "grad_norm": 0.020677493885159492,
      "kl": 0.015691041946411133,
      "learning_rate": 1.981396388672496e-06,
      "loss": -0.009,
      "num_tokens": 263798431.0,
      "reward": 0.3558245003223419,
      "reward_std": 0.059479519724845886,
      "rewards/gemini_judge_reward_func/mean": 0.0982142835855484,
      "rewards/gemini_judge_reward_func/std": 0.1990930736064911,
      "rewards/semantic_correctness_reward_func/mean": 0.43595272302627563,
      "rewards/semantic_correctness_reward_func/std": 0.1965608447790146,
      "rewards/xmlcount_reward_func/mean": 0.5733705759048462,
      "rewards/xmlcount_reward_func/std": 0.49531227350234985,
      "step": 740
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 389.0,
      "completions/mean_length": 157.1741180419922,
      "completions/mean_terminated_length": 145.40724182128906,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.252998164824378,
      "grad_norm": 0.020069163292646408,
      "kl": 0.015597820281982422,
      "learning_rate": 1.9682315247255897e-06,
      "loss": 0.0015,
      "num_tokens": 264141958.0,
      "reward": 0.46845346689224243,
      "reward_std": 0.07669265568256378,
      "rewards/gemini_judge_reward_func/mean": 0.1428571492433548,
      "rewards/gemini_judge_reward_func/std": 0.23369301855564117,
      "rewards/semantic_correctness_reward_func/mean": 0.44457077980041504,
      "rewards/semantic_correctness_reward_func/std": 0.20611417293548584,
      "rewards/xmlcount_reward_func/mean": 0.8059911727905273,
      "rewards/xmlcount_reward_func/std": 0.3960340917110443,
      "step": 741
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 576.0,
      "completions/max_terminated_length": 576.0,
      "completions/mean_length": 150.99107360839844,
      "completions/mean_terminated_length": 150.99107360839844,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.2533395928470829,
      "grad_norm": 0.02022167295217514,
      "kl": 0.015549659729003906,
      "learning_rate": 1.9550998157129946e-06,
      "loss": -0.0146,
      "num_tokens": 264484752.0,
      "reward": 0.4745713174343109,
      "reward_std": 0.06252977252006531,
      "rewards/gemini_judge_reward_func/mean": 0.1473214328289032,
      "rewards/gemini_judge_reward_func/std": 0.2861793637275696,
      "rewards/semantic_correctness_reward_func/mean": 0.45428499579429626,
      "rewards/semantic_correctness_reward_func/std": 0.24253596365451813,
      "rewards/xmlcount_reward_func/mean": 0.8119643330574036,
      "rewards/xmlcount_reward_func/std": 0.38716888427734375,
      "step": 742
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 415.0,
      "completions/mean_length": 162.91964721679688,
      "completions/mean_terminated_length": 147.2636260986328,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.2536810208697879,
      "grad_norm": 0.02243378572165966,
      "kl": 0.016266345977783203,
      "learning_rate": 1.9420014052409793e-06,
      "loss": -0.0132,
      "num_tokens": 264851810.0,
      "reward": 0.4235195517539978,
      "reward_std": 0.07452236860990524,
      "rewards/gemini_judge_reward_func/mean": 0.1395089328289032,
      "rewards/gemini_judge_reward_func/std": 0.2478865385055542,
      "rewards/semantic_correctness_reward_func/mean": 0.480079710483551,
      "rewards/semantic_correctness_reward_func/std": 0.20092709362506866,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 743
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 412.0,
      "completions/mean_length": 160.6875,
      "completions/mean_terminated_length": 152.909912109375,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.25402244889249287,
      "grad_norm": 0.020303336903452873,
      "kl": 0.013786792755126953,
      "learning_rate": 1.928936436551661e-06,
      "loss": -0.0067,
      "num_tokens": 265200628.0,
      "reward": 0.46757015585899353,
      "reward_std": 0.05975889414548874,
      "rewards/gemini_judge_reward_func/mean": 0.1149553582072258,
      "rewards/gemini_judge_reward_func/std": 0.22932343184947968,
      "rewards/semantic_correctness_reward_func/mean": 0.42321667075157166,
      "rewards/semantic_correctness_reward_func/std": 0.20813030004501343,
      "rewards/xmlcount_reward_func/mean": 0.8423617482185364,
      "rewards/xmlcount_reward_func/std": 0.3648380935192108,
      "step": 744
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 639.0,
      "completions/mean_length": 161.35714721679688,
      "completions/mean_terminated_length": 141.66209411621094,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.2543638769151978,
      "grad_norm": 0.02081400714814663,
      "kl": 0.016047000885009766,
      "learning_rate": 1.915905052521445e-06,
      "loss": -0.0215,
      "num_tokens": 265583028.0,
      "reward": 0.38103312253952026,
      "reward_std": 0.05963090807199478,
      "rewards/gemini_judge_reward_func/mean": 0.1183035746216774,
      "rewards/gemini_judge_reward_func/std": 0.2482219785451889,
      "rewards/semantic_correctness_reward_func/mean": 0.4097905158996582,
      "rewards/semantic_correctness_reward_func/std": 0.22186532616615295,
      "rewards/xmlcount_reward_func/mean": 0.6293839812278748,
      "rewards/xmlcount_reward_func/std": 0.4824991822242737,
      "step": 745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 168.90179443359375,
      "completions/mean_terminated_length": 149.3789825439453,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.2547053049379028,
      "grad_norm": 0.020080897957086563,
      "kl": 0.017374038696289062,
      "learning_rate": 1.9029073956594607e-06,
      "loss": -0.0236,
      "num_tokens": 265954738.0,
      "reward": 0.43171700835227966,
      "reward_std": 0.07286177575588226,
      "rewards/gemini_judge_reward_func/mean": 0.1517857164144516,
      "rewards/gemini_judge_reward_func/std": 0.29065632820129395,
      "rewards/semantic_correctness_reward_func/mean": 0.4607636034488678,
      "rewards/semantic_correctness_reward_func/std": 0.23796068131923676,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 746
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 917.0,
      "completions/mean_length": 152.83482360839844,
      "completions/mean_terminated_length": 148.92825317382812,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.25504673296060776,
      "grad_norm": 0.021075395867228508,
      "kl": 0.01665353775024414,
      "learning_rate": 1.8899436081059974e-06,
      "loss": 0.0303,
      "num_tokens": 266302517.0,
      "reward": 0.48689210414886475,
      "reward_std": 0.07886835187673569,
      "rewards/gemini_judge_reward_func/mean": 0.1964285671710968,
      "rewards/gemini_judge_reward_func/std": 0.3121755123138428,
      "rewards/semantic_correctness_reward_func/mean": 0.5043531060218811,
      "rewards/semantic_correctness_reward_func/std": 0.22672364115715027,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 747
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 525.0,
      "completions/mean_length": 158.3482208251953,
      "completions/mean_terminated_length": 134.52293395996094,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.2553881609833127,
      "grad_norm": 0.023336829617619514,
      "kl": 0.02243947982788086,
      "learning_rate": 1.877013831630961e-06,
      "loss": -0.0408,
      "num_tokens": 266703011.0,
      "reward": 0.35559192299842834,
      "reward_std": 0.06644192337989807,
      "rewards/gemini_judge_reward_func/mean": 0.1261160671710968,
      "rewards/gemini_judge_reward_func/std": 0.2571813464164734,
      "rewards/semantic_correctness_reward_func/mean": 0.4174772799015045,
      "rewards/semantic_correctness_reward_func/std": 0.20215776562690735,
      "rewards/xmlcount_reward_func/mean": 0.5541250109672546,
      "rewards/xmlcount_reward_func/std": 0.4987334609031677,
      "step": 748
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 921.0,
      "completions/mean_length": 174.8928680419922,
      "completions/mean_terminated_length": 155.5068359375,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.25572958900601767,
      "grad_norm": 0.01892857439815998,
      "kl": 0.014866113662719727,
      "learning_rate": 1.864118207632315e-06,
      "loss": -0.0094,
      "num_tokens": 267037287.0,
      "reward": 0.4351115822792053,
      "reward_std": 0.062350302934646606,
      "rewards/gemini_judge_reward_func/mean": 0.1082589253783226,
      "rewards/gemini_judge_reward_func/std": 0.19014061987400055,
      "rewards/semantic_correctness_reward_func/mean": 0.4575398862361908,
      "rewards/semantic_correctness_reward_func/std": 0.19340014457702637,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 749
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 810.0,
      "completions/mean_length": 154.46875,
      "completions/mean_terminated_length": 150.56951904296875,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.25607101702872265,
      "grad_norm": 0.019694700837135315,
      "kl": 0.013186931610107422,
      "learning_rate": 1.851256877134538e-06,
      "loss": -0.0324,
      "num_tokens": 267383892.0,
      "reward": 0.4643140137195587,
      "reward_std": 0.06906662881374359,
      "rewards/gemini_judge_reward_func/mean": 0.1450892835855484,
      "rewards/gemini_judge_reward_func/std": 0.27434879541397095,
      "rewards/semantic_correctness_reward_func/mean": 0.4583911895751953,
      "rewards/semantic_correctness_reward_func/std": 0.20246011018753052,
      "rewards/xmlcount_reward_func/mean": 0.786500096321106,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 750
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0401785714285714,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 557.0,
      "completions/mean_length": 189.77679443359375,
      "completions/mean_terminated_length": 154.85581970214844,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.2564124450514276,
      "grad_norm": 0.018297072499990463,
      "kl": 0.013269424438476562,
      "learning_rate": 1.838429980787081e-06,
      "loss": -0.0136,
      "num_tokens": 267744046.0,
      "reward": 0.4284234344959259,
      "reward_std": 0.06550651043653488,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.23435330390930176,
      "rewards/semantic_correctness_reward_func/mean": 0.4554206430912018,
      "rewards/semantic_correctness_reward_func/std": 0.21077513694763184,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 751
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 695.0,
      "completions/mean_length": 171.38839721679688,
      "completions/mean_terminated_length": 147.92201232910156,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.25675387307413255,
      "grad_norm": 0.019334284588694572,
      "kl": 0.013591766357421875,
      "learning_rate": 1.825637658862824e-06,
      "loss": 0.0009,
      "num_tokens": 268074497.0,
      "reward": 0.42137858271598816,
      "reward_std": 0.05194444581866264,
      "rewards/gemini_judge_reward_func/mean": 0.0948660746216774,
      "rewards/gemini_judge_reward_func/std": 0.20001789927482605,
      "rewards/semantic_correctness_reward_func/mean": 0.41566064953804016,
      "rewards/semantic_correctness_reward_func/std": 0.2019502967596054,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 752
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 851.0,
      "completions/mean_length": 160.01339721679688,
      "completions/mean_terminated_length": 152.229736328125,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.25709530109683754,
      "grad_norm": 0.02137918211519718,
      "kl": 0.021810531616210938,
      "learning_rate": 1.8128800512565514e-06,
      "loss": -0.0168,
      "num_tokens": 268442316.0,
      "reward": 0.4417394697666168,
      "reward_std": 0.061608508229255676,
      "rewards/gemini_judge_reward_func/mean": 0.1183035746216774,
      "rewards/gemini_judge_reward_func/std": 0.20899026095867157,
      "rewards/semantic_correctness_reward_func/mean": 0.43484005331993103,
      "rewards/semantic_correctness_reward_func/std": 0.19988852739334106,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 753
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 283.0,
      "completions/mean_length": 155.6875,
      "completions/mean_terminated_length": 151.79373168945312,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.25743672911954246,
      "grad_norm": 0.020462283864617348,
      "kl": 0.015665769577026367,
      "learning_rate": 1.8001572974834169e-06,
      "loss": 0.0127,
      "num_tokens": 268793298.0,
      "reward": 0.4836767017841339,
      "reward_std": 0.06072762608528137,
      "rewards/gemini_judge_reward_func/mean": 0.1752232164144516,
      "rewards/gemini_judge_reward_func/std": 0.27353349328041077,
      "rewards/semantic_correctness_reward_func/mean": 0.4589102268218994,
      "rewards/semantic_correctness_reward_func/std": 0.1992826759815216,
      "rewards/xmlcount_reward_func/mean": 0.8045134544372559,
      "rewards/xmlcount_reward_func/std": 0.395001083612442,
      "step": 754
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 420.0,
      "completions/mean_length": 173.93304443359375,
      "completions/mean_terminated_length": 146.5115203857422,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.25777815714224744,
      "grad_norm": 0.020110012963414192,
      "kl": 0.014056921005249023,
      "learning_rate": 1.7874695366774191e-06,
      "loss": 0.0103,
      "num_tokens": 269164239.0,
      "reward": 0.4110858738422394,
      "reward_std": 0.0682491660118103,
      "rewards/gemini_judge_reward_func/mean": 0.1372767835855484,
      "rewards/gemini_judge_reward_func/std": 0.27684351801872253,
      "rewards/semantic_correctness_reward_func/mean": 0.44917917251586914,
      "rewards/semantic_correctness_reward_func/std": 0.22507894039154053,
      "rewards/xmlcount_reward_func/mean": 0.6658482551574707,
      "rewards/xmlcount_reward_func/std": 0.4710778594017029,
      "step": 755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 743.0,
      "completions/mean_length": 155.49554443359375,
      "completions/mean_terminated_length": 151.6009063720703,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.2581195851649524,
      "grad_norm": 0.01925048790872097,
      "kl": 0.014804601669311523,
      "learning_rate": 1.774816907589873e-06,
      "loss": -0.0002,
      "num_tokens": 269511374.0,
      "reward": 0.4355492889881134,
      "reward_std": 0.061426255851984024,
      "rewards/gemini_judge_reward_func/mean": 0.1305803507566452,
      "rewards/gemini_judge_reward_func/std": 0.249376118183136,
      "rewards/semantic_correctness_reward_func/mean": 0.4709513783454895,
      "rewards/semantic_correctness_reward_func/std": 0.2258910834789276,
      "rewards/xmlcount_reward_func/mean": 0.7228170037269592,
      "rewards/xmlcount_reward_func/std": 0.4484759569168091,
      "step": 756
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 770.0,
      "completions/mean_length": 162.03125,
      "completions/mean_terminated_length": 154.26576232910156,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.25846101318765735,
      "grad_norm": 0.020978957414627075,
      "kl": 0.014691352844238281,
      "learning_rate": 1.7621995485879062e-06,
      "loss": -0.0172,
      "num_tokens": 269879709.0,
      "reward": 0.417973130941391,
      "reward_std": 0.06662485003471375,
      "rewards/gemini_judge_reward_func/mean": 0.1104910746216774,
      "rewards/gemini_judge_reward_func/std": 0.2327246069908142,
      "rewards/semantic_correctness_reward_func/mean": 0.4434188902378082,
      "rewards/semantic_correctness_reward_func/std": 0.18930912017822266,
      "rewards/xmlcount_reward_func/mean": 0.712732195854187,
      "rewards/xmlcount_reward_func/std": 0.4530121684074402,
      "step": 757
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 394.0,
      "completions/mean_length": 157.69644165039062,
      "completions/mean_terminated_length": 149.8918914794922,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.25880244121036233,
      "grad_norm": 0.020836396142840385,
      "kl": 0.012416601181030273,
      "learning_rate": 1.749617597652934e-06,
      "loss": 0.0155,
      "num_tokens": 270216073.0,
      "reward": 0.45066124200820923,
      "reward_std": 0.06927520036697388,
      "rewards/gemini_judge_reward_func/mean": 0.109375,
      "rewards/gemini_judge_reward_func/std": 0.22280631959438324,
      "rewards/semantic_correctness_reward_func/mean": 0.4258059859275818,
      "rewards/semantic_correctness_reward_func/std": 0.1996365785598755,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 758
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 323.0,
      "completions/mean_length": 159.52679443359375,
      "completions/mean_terminated_length": 147.79185485839844,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.2591438692330673,
      "grad_norm": 0.02067898027598858,
      "kl": 0.013218402862548828,
      "learning_rate": 1.7370711923791567e-06,
      "loss": -0.0102,
      "num_tokens": 270580307.0,
      "reward": 0.4589332044124603,
      "reward_std": 0.06333411484956741,
      "rewards/gemini_judge_reward_func/mean": 0.1484375,
      "rewards/gemini_judge_reward_func/std": 0.2761194407939911,
      "rewards/semantic_correctness_reward_func/mean": 0.4605408012866974,
      "rewards/semantic_correctness_reward_func/std": 0.22162102162837982,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 759
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1023.0,
      "completions/mean_length": 180.91964721679688,
      "completions/mean_terminated_length": 153.72349548339844,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.25948529725577224,
      "grad_norm": 0.0202178992331028,
      "kl": 0.013530254364013672,
      "learning_rate": 1.7245604699720536e-06,
      "loss": -0.025,
      "num_tokens": 270939521.0,
      "reward": 0.42256343364715576,
      "reward_std": 0.06891320645809174,
      "rewards/gemini_judge_reward_func/mean": 0.1696428507566452,
      "rewards/gemini_judge_reward_func/std": 0.29326191544532776,
      "rewards/semantic_correctness_reward_func/mean": 0.438263475894928,
      "rewards/semantic_correctness_reward_func/std": 0.20034128427505493,
      "rewards/xmlcount_reward_func/mean": 0.6676340103149414,
      "rewards/xmlcount_reward_func/std": 0.4686497449874878,
      "step": 760
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 157.61607360839844,
      "completions/mean_terminated_length": 149.81082153320312,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.2598267252784772,
      "grad_norm": 0.019846944138407707,
      "kl": 0.01799154281616211,
      "learning_rate": 1.7120855672468779e-06,
      "loss": 0.0041,
      "num_tokens": 271284047.0,
      "reward": 0.4936152398586273,
      "reward_std": 0.06837694346904755,
      "rewards/gemini_judge_reward_func/mean": 0.1997767835855484,
      "rewards/gemini_judge_reward_func/std": 0.29091235995292664,
      "rewards/semantic_correctness_reward_func/mean": 0.4955224394798279,
      "rewards/semantic_correctness_reward_func/std": 0.2096569538116455,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 761
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 162.7544708251953,
      "completions/mean_terminated_length": 151.0633544921875,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.2601681533011822,
      "grad_norm": 0.020178409293293953,
      "kl": 0.01193547248840332,
      "learning_rate": 1.6996466206271679e-06,
      "loss": 0.0086,
      "num_tokens": 271631348.0,
      "reward": 0.47532185912132263,
      "reward_std": 0.07062742859125137,
      "rewards/gemini_judge_reward_func/mean": 0.1238839253783226,
      "rewards/gemini_judge_reward_func/std": 0.2460424154996872,
      "rewards/semantic_correctness_reward_func/mean": 0.47319841384887695,
      "rewards/semantic_correctness_reward_func/std": 0.21414029598236084,
      "rewards/xmlcount_reward_func/mean": 0.8278214335441589,
      "rewards/xmlcount_reward_func/std": 0.3753868639469147,
      "step": 762
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 618.0,
      "completions/mean_length": 170.85269165039062,
      "completions/mean_terminated_length": 147.37155151367188,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.2605095813238872,
      "grad_norm": 0.01988663524389267,
      "kl": 0.013807296752929688,
      "learning_rate": 1.6872437661432518e-06,
      "loss": -0.036,
      "num_tokens": 271988783.0,
      "reward": 0.43647101521492004,
      "reward_std": 0.059993524104356766,
      "rewards/gemini_judge_reward_func/mean": 0.1127232164144516,
      "rewards/gemini_judge_reward_func/std": 0.23524853587150574,
      "rewards/semantic_correctness_reward_func/mean": 0.46438151597976685,
      "rewards/semantic_correctness_reward_func/std": 0.2084624022245407,
      "rewards/xmlcount_reward_func/mean": 0.7462633848190308,
      "rewards/xmlcount_reward_func/std": 0.4369613230228424,
      "step": 763
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 705.0,
      "completions/max_terminated_length": 705.0,
      "completions/mean_length": 157.19644165039062,
      "completions/mean_terminated_length": 157.19644165039062,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.2608510093465921,
      "grad_norm": 0.01950724609196186,
      "kl": 0.012348175048828125,
      "learning_rate": 1.6748771394307584e-06,
      "loss": -0.0017,
      "num_tokens": 272317423.0,
      "reward": 0.43308770656585693,
      "reward_std": 0.04825283959507942,
      "rewards/gemini_judge_reward_func/mean": 0.0970982164144516,
      "rewards/gemini_judge_reward_func/std": 0.24214671552181244,
      "rewards/semantic_correctness_reward_func/mean": 0.3988582193851471,
      "rewards/semantic_correctness_reward_func/std": 0.19523859024047852,
      "rewards/xmlcount_reward_func/mean": 0.7861920595169067,
      "rewards/xmlcount_reward_func/std": 0.4122726619243622,
      "step": 764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 383.0,
      "completions/mean_length": 164.9732208251953,
      "completions/mean_terminated_length": 149.3545379638672,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.2611924373692971,
      "grad_norm": 0.020313076674938202,
      "kl": 0.01384592056274414,
      "learning_rate": 1.6625468757291379e-06,
      "loss": -0.0154,
      "num_tokens": 272666877.0,
      "reward": 0.4314127266407013,
      "reward_std": 0.06205175817012787,
      "rewards/gemini_judge_reward_func/mean": 0.1015625,
      "rewards/gemini_judge_reward_func/std": 0.18942859768867493,
      "rewards/semantic_correctness_reward_func/mean": 0.41668835282325745,
      "rewards/semantic_correctness_reward_func/std": 0.1859598010778427,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 765
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 338.0,
      "completions/mean_length": 171.37054443359375,
      "completions/mean_terminated_length": 143.86636352539062,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.2615338653920021,
      "grad_norm": 0.020971033722162247,
      "kl": 0.013844966888427734,
      "learning_rate": 1.6502531098801756e-06,
      "loss": -0.019,
      "num_tokens": 273015940.0,
      "reward": 0.42292889952659607,
      "reward_std": 0.0625869482755661,
      "rewards/gemini_judge_reward_func/mean": 0.1116071417927742,
      "rewards/gemini_judge_reward_func/std": 0.22295227646827698,
      "rewards/semantic_correctness_reward_func/mean": 0.4241444170475006,
      "rewards/semantic_correctness_reward_func/std": 0.1952347457408905,
      "rewards/xmlcount_reward_func/mean": 0.7336428761482239,
      "rewards/xmlcount_reward_func/std": 0.49561816453933716,
      "step": 766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 894.0,
      "completions/mean_length": 163.9419708251953,
      "completions/mean_terminated_length": 152.2669677734375,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.261875293414707,
      "grad_norm": 0.02085995301604271,
      "kl": 0.013644695281982422,
      "learning_rate": 1.6379959763265268e-06,
      "loss": -0.0253,
      "num_tokens": 273385231.0,
      "reward": 0.40597787499427795,
      "reward_std": 0.05839291214942932,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.24942879378795624,
      "rewards/semantic_correctness_reward_func/mean": 0.4526750147342682,
      "rewards/semantic_correctness_reward_func/std": 0.18940961360931396,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 767
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 492.0,
      "completions/mean_length": 164.46429443359375,
      "completions/mean_terminated_length": 152.79638671875,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.262216721437412,
      "grad_norm": 0.019793834537267685,
      "kl": 0.012224435806274414,
      "learning_rate": 1.62577560911024e-06,
      "loss": -0.0217,
      "num_tokens": 273730931.0,
      "reward": 0.46663984656333923,
      "reward_std": 0.07231976836919785,
      "rewards/gemini_judge_reward_func/mean": 0.1383928507566452,
      "rewards/gemini_judge_reward_func/std": 0.23873566091060638,
      "rewards/semantic_correctness_reward_func/mean": 0.43424367904663086,
      "rewards/semantic_correctness_reward_func/std": 0.20122456550598145,
      "rewards/xmlcount_reward_func/mean": 0.8110848665237427,
      "rewards/xmlcount_reward_func/std": 0.3890477418899536,
      "step": 768
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 429.0,
      "completions/mean_length": 166.2053680419922,
      "completions/mean_terminated_length": 142.59632873535156,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.26255814946011696,
      "grad_norm": 0.019900793209671974,
      "kl": 0.012137651443481445,
      "learning_rate": 1.6135921418712959e-06,
      "loss": -0.0266,
      "num_tokens": 274109337.0,
      "reward": 0.4262751340866089,
      "reward_std": 0.06663599610328674,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.2549728751182556,
      "rewards/semantic_correctness_reward_func/mean": 0.4357326328754425,
      "rewards/semantic_correctness_reward_func/std": 0.221801295876503,
      "rewards/xmlcount_reward_func/mean": 0.7194732427597046,
      "rewards/xmlcount_reward_func/std": 0.44856733083724976,
      "step": 769
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 696.0,
      "completions/mean_length": 165.85269165039062,
      "completions/mean_terminated_length": 150.25,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.2628995774828219,
      "grad_norm": 0.020068790763616562,
      "kl": 0.011019706726074219,
      "learning_rate": 1.6014457078461354e-06,
      "loss": -0.004,
      "num_tokens": 274433136.0,
      "reward": 0.49018624424934387,
      "reward_std": 0.05191074684262276,
      "rewards/gemini_judge_reward_func/mean": 0.1395089328289032,
      "rewards/gemini_judge_reward_func/std": 0.25894618034362793,
      "rewards/semantic_correctness_reward_func/mean": 0.420163094997406,
      "rewards/semantic_correctness_reward_func/std": 0.21702919900417328,
      "rewards/xmlcount_reward_func/mean": 0.8758750557899475,
      "rewards/xmlcount_reward_func/std": 0.33179107308387756,
      "step": 770
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 619.0,
      "completions/mean_length": 151.52232360839844,
      "completions/mean_terminated_length": 139.67874145507812,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.26324100550552687,
      "grad_norm": 0.02097266912460327,
      "kl": 0.02287602424621582,
      "learning_rate": 1.5893364398662175e-06,
      "loss": -0.0015,
      "num_tokens": 274817569.0,
      "reward": 0.4012994170188904,
      "reward_std": 0.06106601655483246,
      "rewards/gemini_judge_reward_func/mean": 0.1160714253783226,
      "rewards/gemini_judge_reward_func/std": 0.25483787059783936,
      "rewards/semantic_correctness_reward_func/mean": 0.40730950236320496,
      "rewards/semantic_correctness_reward_func/std": 0.2029609978199005,
      "rewards/xmlcount_reward_func/mean": 0.6835223436355591,
      "rewards/xmlcount_reward_func/std": 0.4653799831867218,
      "step": 771
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 862.0,
      "completions/mean_length": 162.83929443359375,
      "completions/mean_terminated_length": 151.14932250976562,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.26358243352823185,
      "grad_norm": 0.02049219235777855,
      "kl": 0.015248298645019531,
      "learning_rate": 1.5772644703565564e-06,
      "loss": -0.0374,
      "num_tokens": 275173829.0,
      "reward": 0.45400094985961914,
      "reward_std": 0.07043319195508957,
      "rewards/gemini_judge_reward_func/mean": 0.1328125,
      "rewards/gemini_judge_reward_func/std": 0.23898446559906006,
      "rewards/semantic_correctness_reward_func/mean": 0.4313795268535614,
      "rewards/semantic_correctness_reward_func/std": 0.1887551099061966,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 772
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 922.0,
      "completions/mean_length": 173.20089721679688,
      "completions/mean_terminated_length": 157.7318115234375,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.2639238615509368,
      "grad_norm": 0.020059634000062943,
      "kl": 0.013273954391479492,
      "learning_rate": 1.5652299313342772e-06,
      "loss": -0.0095,
      "num_tokens": 275523618.0,
      "reward": 0.4348951280117035,
      "reward_std": 0.06464679539203644,
      "rewards/gemini_judge_reward_func/mean": 0.1305803507566452,
      "rewards/gemini_judge_reward_func/std": 0.2571229338645935,
      "rewards/semantic_correctness_reward_func/mean": 0.4475648105144501,
      "rewards/semantic_correctness_reward_func/std": 0.2125941663980484,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 773
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.0,
      "completions/mean_length": 163.45089721679688,
      "completions/mean_terminated_length": 155.6981964111328,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.26426528957364176,
      "grad_norm": 0.020751817151904106,
      "kl": 0.01684427261352539,
      "learning_rate": 1.5532329544071712e-06,
      "loss": 0.0093,
      "num_tokens": 275892547.0,
      "reward": 0.41509416699409485,
      "reward_std": 0.05522005259990692,
      "rewards/gemini_judge_reward_func/mean": 0.109375,
      "rewards/gemini_judge_reward_func/std": 0.22406068444252014,
      "rewards/semantic_correctness_reward_func/mean": 0.4133010506629944,
      "rewards/semantic_correctness_reward_func/std": 0.20829689502716064,
      "rewards/xmlcount_reward_func/mean": 0.7217098474502563,
      "rewards/xmlcount_reward_func/std": 0.4462066888809204,
      "step": 774
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 272.0,
      "completions/mean_length": 147.7678680419922,
      "completions/mean_terminated_length": 143.8385772705078,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.26460671759634674,
      "grad_norm": 0.018952973186969757,
      "kl": 0.013130664825439453,
      "learning_rate": 1.5412736707722537e-06,
      "loss": -0.0097,
      "num_tokens": 276235711.0,
      "reward": 0.4852498769760132,
      "reward_std": 0.053085166960954666,
      "rewards/gemini_judge_reward_func/mean": 0.1919642835855484,
      "rewards/gemini_judge_reward_func/std": 0.2860393822193146,
      "rewards/semantic_correctness_reward_func/mean": 0.48837438225746155,
      "rewards/semantic_correctness_reward_func/std": 0.2230820506811142,
      "rewards/xmlcount_reward_func/mean": 0.7769732475280762,
      "rewards/xmlcount_reward_func/std": 0.4148280918598175,
      "step": 775
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 182.04019165039062,
      "completions/mean_terminated_length": 158.86695861816406,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.26494814561905167,
      "grad_norm": 0.02046089619398117,
      "kl": 0.0116729736328125,
      "learning_rate": 1.5293522112143371e-06,
      "loss": -0.0221,
      "num_tokens": 276577924.0,
      "reward": 0.40735191106796265,
      "reward_std": 0.0653359517455101,
      "rewards/gemini_judge_reward_func/mean": 0.1171875,
      "rewards/gemini_judge_reward_func/std": 0.21555359661579132,
      "rewards/semantic_correctness_reward_func/mean": 0.4438844621181488,
      "rewards/semantic_correctness_reward_func/std": 0.2117227166891098,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 776
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 461.0,
      "completions/mean_length": 163.1294708251953,
      "completions/mean_terminated_length": 147.47726440429688,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.26528957364175665,
      "grad_norm": 0.019794687628746033,
      "kl": 0.01212763786315918,
      "learning_rate": 1.517468706104589e-06,
      "loss": 0.0108,
      "num_tokens": 276940561.0,
      "reward": 0.42630788683891296,
      "reward_std": 0.061067983508110046,
      "rewards/gemini_judge_reward_func/mean": 0.1004464253783226,
      "rewards/gemini_judge_reward_func/std": 0.20355534553527832,
      "rewards/semantic_correctness_reward_func/mean": 0.4291464686393738,
      "rewards/semantic_correctness_reward_func/std": 0.20234155654907227,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 777
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1002.0,
      "completions/mean_length": 177.58929443359375,
      "completions/mean_terminated_length": 166.09954833984375,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.26563100166446163,
      "grad_norm": 0.021563053131103516,
      "kl": 0.020466327667236328,
      "learning_rate": 1.505623285399121e-06,
      "loss": -0.0163,
      "num_tokens": 277263169.0,
      "reward": 0.45694440603256226,
      "reward_std": 0.06528093665838242,
      "rewards/gemini_judge_reward_func/mean": 0.1060267835855484,
      "rewards/gemini_judge_reward_func/std": 0.23360466957092285,
      "rewards/semantic_correctness_reward_func/mean": 0.3925253450870514,
      "rewards/semantic_correctness_reward_func/std": 0.21922528743743896,
      "rewards/xmlcount_reward_func/mean": 0.8400714993476868,
      "rewards/xmlcount_reward_func/std": 0.3684346675872803,
      "step": 778
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.044642857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 706.0,
      "completions/mean_length": 186.77232360839844,
      "completions/mean_terminated_length": 147.64952087402344,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.26597242968716656,
      "grad_norm": 0.020064886659383774,
      "kl": 0.015275955200195312,
      "learning_rate": 1.4938160786375571e-06,
      "loss": -0.02,
      "num_tokens": 277623346.0,
      "reward": 0.44759249687194824,
      "reward_std": 0.07056890428066254,
      "rewards/gemini_judge_reward_func/mean": 0.1595982164144516,
      "rewards/gemini_judge_reward_func/std": 0.2799893319606781,
      "rewards/semantic_correctness_reward_func/mean": 0.4585872292518616,
      "rewards/semantic_correctness_reward_func/std": 0.2106294482946396,
      "rewards/xmlcount_reward_func/mean": 0.7300893068313599,
      "rewards/xmlcount_reward_func/std": 0.43690332770347595,
      "step": 779
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 732.0,
      "completions/mean_length": 153.2678680419922,
      "completions/mean_terminated_length": 145.42343139648438,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.26631385770987154,
      "grad_norm": 0.020851830020546913,
      "kl": 0.015101909637451172,
      "learning_rate": 1.4820472149416153e-06,
      "loss": 0.0062,
      "num_tokens": 277983922.0,
      "reward": 0.42797625064849854,
      "reward_std": 0.06007370352745056,
      "rewards/gemini_judge_reward_func/mean": 0.1651785671710968,
      "rewards/gemini_judge_reward_func/std": 0.2594698965549469,
      "rewards/semantic_correctness_reward_func/mean": 0.47787216305732727,
      "rewards/semantic_correctness_reward_func/std": 0.1910025030374527,
      "rewards/xmlcount_reward_func/mean": 0.6658259034156799,
      "rewards/xmlcount_reward_func/std": 0.4710857570171356,
      "step": 780
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 325.0,
      "completions/mean_length": 156.3794708251953,
      "completions/mean_terminated_length": 140.6045379638672,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.2666552857325765,
      "grad_norm": 0.02058962918817997,
      "kl": 0.014112472534179688,
      "learning_rate": 1.4703168230137072e-06,
      "loss": -0.01,
      "num_tokens": 278356979.0,
      "reward": 0.39631539583206177,
      "reward_std": 0.06510846316814423,
      "rewards/gemini_judge_reward_func/mean": 0.109375,
      "rewards/gemini_judge_reward_func/std": 0.21381975710391998,
      "rewards/semantic_correctness_reward_func/mean": 0.4400768578052521,
      "rewards/semantic_correctness_reward_func/std": 0.1941695511341095,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 781
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 811.0,
      "completions/mean_length": 152.14732360839844,
      "completions/mean_terminated_length": 144.2928009033203,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.26699671375528145,
      "grad_norm": 0.02044842764735222,
      "kl": 0.013594627380371094,
      "learning_rate": 1.4586250311355132e-06,
      "loss": -0.0371,
      "num_tokens": 278714588.0,
      "reward": 0.44066599011421204,
      "reward_std": 0.055519502609968185,
      "rewards/gemini_judge_reward_func/mean": 0.1294642835855484,
      "rewards/gemini_judge_reward_func/std": 0.23433461785316467,
      "rewards/semantic_correctness_reward_func/mean": 0.4429011344909668,
      "rewards/semantic_correctness_reward_func/std": 0.19725266098976135,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 782
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 670.0,
      "completions/mean_length": 166.03125,
      "completions/mean_terminated_length": 146.4429168701172,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.2673381417779864,
      "grad_norm": 0.0210841353982687,
      "kl": 0.019263029098510742,
      "learning_rate": 1.4469719671666043e-06,
      "loss": -0.0098,
      "num_tokens": 279071023.0,
      "reward": 0.4465175271034241,
      "reward_std": 0.0766262337565422,
      "rewards/gemini_judge_reward_func/mean": 0.140625,
      "rewards/gemini_judge_reward_func/std": 0.24668484926223755,
      "rewards/semantic_correctness_reward_func/mean": 0.44983741641044617,
      "rewards/semantic_correctness_reward_func/std": 0.2102912813425064,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 783
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 321.0,
      "completions/max_terminated_length": 321.0,
      "completions/mean_length": 149.70089721679688,
      "completions/mean_terminated_length": 149.70089721679688,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.2676795698006914,
      "grad_norm": 0.020219266414642334,
      "kl": 0.012418746948242188,
      "learning_rate": 1.4353577585430152e-06,
      "loss": -0.0121,
      "num_tokens": 279425912.0,
      "reward": 0.46571969985961914,
      "reward_std": 0.0656966120004654,
      "rewards/gemini_judge_reward_func/mean": 0.1138392835855484,
      "rewards/gemini_judge_reward_func/std": 0.20197565853595734,
      "rewards/semantic_correctness_reward_func/mean": 0.443071573972702,
      "rewards/semantic_correctness_reward_func/std": 0.1867143213748932,
      "rewards/xmlcount_reward_func/mean": 0.8289241790771484,
      "rewards/xmlcount_reward_func/std": 0.3740423023700714,
      "step": 784
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 746.0,
      "completions/mean_length": 163.125,
      "completions/mean_terminated_length": 159.26458740234375,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.26802099782339633,
      "grad_norm": 0.023210877552628517,
      "kl": 0.014898300170898438,
      "learning_rate": 1.4237825322758735e-06,
      "loss": -0.0172,
      "num_tokens": 279769636.0,
      "reward": 0.4273902475833893,
      "reward_std": 0.04811634495854378,
      "rewards/gemini_judge_reward_func/mean": 0.0747767835855484,
      "rewards/gemini_judge_reward_func/std": 0.157900869846344,
      "rewards/semantic_correctness_reward_func/mean": 0.4377457797527313,
      "rewards/semantic_correctness_reward_func/std": 0.17589552700519562,
      "rewards/xmlcount_reward_func/mean": 0.774825930595398,
      "rewards/xmlcount_reward_func/std": 0.41559502482414246,
      "step": 785
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 903.0,
      "completions/mean_length": 167.7991180419922,
      "completions/mean_terminated_length": 156.17648315429688,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.2683624258461013,
      "grad_norm": 0.022142644971609116,
      "kl": 0.01622152328491211,
      "learning_rate": 1.412246414949997e-06,
      "loss": -0.0116,
      "num_tokens": 280144391.0,
      "reward": 0.41873544454574585,
      "reward_std": 0.05372651666402817,
      "rewards/gemini_judge_reward_func/mean": 0.1171875,
      "rewards/gemini_judge_reward_func/std": 0.22571587562561035,
      "rewards/semantic_correctness_reward_func/mean": 0.44717708230018616,
      "rewards/semantic_correctness_reward_func/std": 0.21336087584495544,
      "rewards/xmlcount_reward_func/mean": 0.7060624957084656,
      "rewards/xmlcount_reward_func/std": 0.4573599696159363,
      "step": 786
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.044642857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 939.0,
      "completions/mean_length": 190.55357360839844,
      "completions/mean_terminated_length": 151.6074676513672,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.2687038538688063,
      "grad_norm": 0.027027791365981102,
      "kl": 0.01961684226989746,
      "learning_rate": 1.4007495327225162e-06,
      "loss": 0.0185,
      "num_tokens": 280532283.0,
      "reward": 0.40803104639053345,
      "reward_std": 0.07797716557979584,
      "rewards/gemini_judge_reward_func/mean": 0.1261160671710968,
      "rewards/gemini_judge_reward_func/std": 0.23675422370433807,
      "rewards/semantic_correctness_reward_func/mean": 0.42274436354637146,
      "rewards/semantic_correctness_reward_func/std": 0.20996291935443878,
      "rewards/xmlcount_reward_func/mean": 0.6825892329216003,
      "rewards/xmlcount_reward_func/std": 0.46256276965141296,
      "step": 787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 481.0,
      "completions/mean_length": 165.3616180419922,
      "completions/mean_terminated_length": 149.75,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.2690452818915112,
      "grad_norm": 0.01970202848315239,
      "kl": 0.015131473541259766,
      "learning_rate": 1.389292011321498e-06,
      "loss": -0.026,
      "num_tokens": 280909580.0,
      "reward": 0.42601242661476135,
      "reward_std": 0.07185456156730652,
      "rewards/gemini_judge_reward_func/mean": 0.1395089328289032,
      "rewards/gemini_judge_reward_func/std": 0.26323994994163513,
      "rewards/semantic_correctness_reward_func/mean": 0.45679420232772827,
      "rewards/semantic_correctness_reward_func/std": 0.212924987077713,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 606.0,
      "completions/mean_length": 161.85714721679688,
      "completions/mean_terminated_length": 146.1818084716797,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.2693867099142162,
      "grad_norm": 0.022267106920480728,
      "kl": 0.013372421264648438,
      "learning_rate": 1.3778739760445552e-06,
      "loss": -0.0111,
      "num_tokens": 281254012.0,
      "reward": 0.4726380705833435,
      "reward_std": 0.0705428421497345,
      "rewards/gemini_judge_reward_func/mean": 0.1607142835855484,
      "rewards/gemini_judge_reward_func/std": 0.27581340074539185,
      "rewards/semantic_correctness_reward_func/mean": 0.4330117106437683,
      "rewards/semantic_correctness_reward_func/std": 0.22527293860912323,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 789
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 826.0,
      "completions/mean_length": 157.98214721679688,
      "completions/mean_terminated_length": 146.22625732421875,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.2697281379369212,
      "grad_norm": 0.021097326651215553,
      "kl": 0.015774250030517578,
      "learning_rate": 1.3664955517574967e-06,
      "loss": -0.0176,
      "num_tokens": 281637792.0,
      "reward": 0.4128992259502411,
      "reward_std": 0.06964334845542908,
      "rewards/gemini_judge_reward_func/mean": 0.1495535671710968,
      "rewards/gemini_judge_reward_func/std": 0.26249030232429504,
      "rewards/semantic_correctness_reward_func/mean": 0.4426388740539551,
      "rewards/semantic_correctness_reward_func/std": 0.21492232382297516,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 790
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 887.0,
      "completions/mean_length": 168.2366180419922,
      "completions/mean_terminated_length": 160.5270233154297,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.2700695659596261,
      "grad_norm": 0.018702613189816475,
      "kl": 0.011590957641601562,
      "learning_rate": 1.3551568628929434e-06,
      "loss": 0.0018,
      "num_tokens": 281995737.0,
      "reward": 0.4427972733974457,
      "reward_std": 0.05938927084207535,
      "rewards/gemini_judge_reward_func/mean": 0.0870535746216774,
      "rewards/gemini_judge_reward_func/std": 0.16811345517635345,
      "rewards/semantic_correctness_reward_func/mean": 0.42219167947769165,
      "rewards/semantic_correctness_reward_func/std": 0.18814805150032043,
      "rewards/xmlcount_reward_func/mean": 0.8088437914848328,
      "rewards/xmlcount_reward_func/std": 0.3951219618320465,
      "step": 791
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 322.0,
      "completions/mean_length": 167.6875,
      "completions/mean_terminated_length": 144.1192626953125,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.2704109939823311,
      "grad_norm": 0.020110948011279106,
      "kl": 0.013335943222045898,
      "learning_rate": 1.343858033448982e-06,
      "loss": -0.037,
      "num_tokens": 282359479.0,
      "reward": 0.4621303975582123,
      "reward_std": 0.06999564170837402,
      "rewards/gemini_judge_reward_func/mean": 0.1517857164144516,
      "rewards/gemini_judge_reward_func/std": 0.27984848618507385,
      "rewards/semantic_correctness_reward_func/mean": 0.4340803325176239,
      "rewards/semantic_correctness_reward_func/std": 0.2148793637752533,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 792
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 320.0,
      "completions/max_terminated_length": 320.0,
      "completions/mean_length": 144.5803680419922,
      "completions/mean_terminated_length": 144.5803680419922,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.2707524220050361,
      "grad_norm": 0.020432932302355766,
      "kl": 0.01767253875732422,
      "learning_rate": 1.3325991869878013e-06,
      "loss": 0.0123,
      "num_tokens": 282724829.0,
      "reward": 0.4063700735569,
      "reward_std": 0.06738902628421783,
      "rewards/gemini_judge_reward_func/mean": 0.1149553582072258,
      "rewards/gemini_judge_reward_func/std": 0.24007098376750946,
      "rewards/semantic_correctness_reward_func/mean": 0.40768957138061523,
      "rewards/semantic_correctness_reward_func/std": 0.21261626482009888,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 321.0,
      "completions/mean_length": 149.48214721679688,
      "completions/mean_terminated_length": 145.560546875,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.271093850027741,
      "grad_norm": 0.02059169113636017,
      "kl": 0.016257762908935547,
      "learning_rate": 1.321380446634342e-06,
      "loss": -0.0034,
      "num_tokens": 283067757.0,
      "reward": 0.42399582266807556,
      "reward_std": 0.06734077632427216,
      "rewards/gemini_judge_reward_func/mean": 0.1060267835855484,
      "rewards/gemini_judge_reward_func/std": 0.235991969704628,
      "rewards/semantic_correctness_reward_func/mean": 0.41760390996932983,
      "rewards/semantic_correctness_reward_func/std": 0.21390259265899658,
      "rewards/xmlcount_reward_func/mean": 0.7451607584953308,
      "rewards/xmlcount_reward_func/std": 0.4366372525691986,
      "step": 794
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0401785714285714,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 408.0,
      "completions/mean_length": 183.84376525878906,
      "completions/mean_terminated_length": 148.67442321777344,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.271435278050446,
      "grad_norm": 0.018847793340682983,
      "kl": 0.012856006622314453,
      "learning_rate": 1.3102019350749528e-06,
      "loss": -0.0197,
      "num_tokens": 283461510.0,
      "reward": 0.3889876902103424,
      "reward_std": 0.06528311222791672,
      "rewards/gemini_judge_reward_func/mean": 0.1305803507566452,
      "rewards/gemini_judge_reward_func/std": 0.2516138255596161,
      "rewards/semantic_correctness_reward_func/mean": 0.4201793968677521,
      "rewards/semantic_correctness_reward_func/std": 0.1820724755525589,
      "rewards/xmlcount_reward_func/mean": 0.6317991614341736,
      "rewards/xmlcount_reward_func/std": 0.48062554001808167,
      "step": 795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 445.0,
      "completions/mean_length": 156.80804443359375,
      "completions/mean_terminated_length": 148.99549865722656,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.27177670607315096,
      "grad_norm": 0.020621461793780327,
      "kl": 0.012531042098999023,
      "learning_rate": 1.2990637745560418e-06,
      "loss": 0.0117,
      "num_tokens": 283817627.0,
      "reward": 0.46992671489715576,
      "reward_std": 0.08015181124210358,
      "rewards/gemini_judge_reward_func/mean": 0.1517857164144516,
      "rewards/gemini_judge_reward_func/std": 0.27068495750427246,
      "rewards/semantic_correctness_reward_func/mean": 0.455187052488327,
      "rewards/semantic_correctness_reward_func/std": 0.20298680663108826,
      "rewards/xmlcount_reward_func/mean": 0.7954375147819519,
      "rewards/xmlcount_reward_func/std": 0.405271977186203,
      "step": 796
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 911.0,
      "completions/mean_length": 148.75894165039062,
      "completions/mean_terminated_length": 144.83409118652344,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.2721181340958559,
      "grad_norm": 0.020870037376880646,
      "kl": 0.014215469360351562,
      "learning_rate": 1.2879660868827508e-06,
      "loss": -0.03,
      "num_tokens": 284171409.0,
      "reward": 0.40534916520118713,
      "reward_std": 0.051454123109579086,
      "rewards/gemini_judge_reward_func/mean": 0.1104910746216774,
      "rewards/gemini_judge_reward_func/std": 0.23151718080043793,
      "rewards/semantic_correctness_reward_func/mean": 0.41151338815689087,
      "rewards/semantic_correctness_reward_func/std": 0.21736524999141693,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 797
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 929.0,
      "completions/mean_length": 172.7991180419922,
      "completions/mean_terminated_length": 145.3410186767578,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.27245956211856087,
      "grad_norm": 0.021433832123875618,
      "kl": 0.014792680740356445,
      "learning_rate": 1.2769089934176126e-06,
      "loss": 0.0023,
      "num_tokens": 284538724.0,
      "reward": 0.4125317931175232,
      "reward_std": 0.06591948866844177,
      "rewards/gemini_judge_reward_func/mean": 0.1216517835855484,
      "rewards/gemini_judge_reward_func/std": 0.23435331881046295,
      "rewards/semantic_correctness_reward_func/mean": 0.4251053035259247,
      "rewards/semantic_correctness_reward_func/std": 0.19413845241069794,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 798
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 629.0,
      "completions/mean_length": 165.05804443359375,
      "completions/mean_terminated_length": 153.398193359375,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.27280099014126585,
      "grad_norm": 0.02007019706070423,
      "kl": 0.0131683349609375,
      "learning_rate": 1.2658926150792321e-06,
      "loss": 0.0247,
      "num_tokens": 284890389.0,
      "reward": 0.43971216678619385,
      "reward_std": 0.06779628247022629,
      "rewards/gemini_judge_reward_func/mean": 0.1238839253783226,
      "rewards/gemini_judge_reward_func/std": 0.2615041136741638,
      "rewards/semantic_correctness_reward_func/mean": 0.4046053886413574,
      "rewards/semantic_correctness_reward_func/std": 0.21147848665714264,
      "rewards/xmlcount_reward_func/mean": 0.7730938196182251,
      "rewards/xmlcount_reward_func/std": 0.4206935167312622,
      "step": 799
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 599.0,
      "completions/mean_length": 174.90626525878906,
      "completions/mean_terminated_length": 155.52053833007812,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.27314241816397083,
      "grad_norm": 0.018950890749692917,
      "kl": 0.012112617492675781,
      "learning_rate": 1.2549170723409548e-06,
      "loss": 0.031,
      "num_tokens": 285250060.0,
      "reward": 0.44825732707977295,
      "reward_std": 0.07844427973031998,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.21174997091293335,
      "rewards/semantic_correctness_reward_func/mean": 0.43173304200172424,
      "rewards/semantic_correctness_reward_func/std": 0.20918342471122742,
      "rewards/xmlcount_reward_func/mean": 0.7775446772575378,
      "rewards/xmlcount_reward_func/std": 0.4613456428050995,
      "step": 800
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 314.0,
      "completions/mean_length": 174.5178680419922,
      "completions/mean_terminated_length": 147.11520385742188,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.27348384618667576,
      "grad_norm": 0.01994583196938038,
      "kl": 0.01393747329711914,
      "learning_rate": 1.243982485229559e-06,
      "loss": -0.0056,
      "num_tokens": 285638904.0,
      "reward": 0.42606374621391296,
      "reward_std": 0.06385260820388794,
      "rewards/gemini_judge_reward_func/mean": 0.1104910746216774,
      "rewards/gemini_judge_reward_func/std": 0.2512555420398712,
      "rewards/semantic_correctness_reward_func/mean": 0.41007763147354126,
      "rewards/semantic_correctness_reward_func/std": 0.207870215177536,
      "rewards/xmlcount_reward_func/mean": 0.749629557132721,
      "rewards/xmlcount_reward_func/std": 0.434091717004776,
      "step": 801
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 564.0,
      "completions/mean_length": 159.65179443359375,
      "completions/mean_terminated_length": 147.91856384277344,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.27382527420938074,
      "grad_norm": 0.019207848235964775,
      "kl": 0.013546228408813477,
      "learning_rate": 1.233088973323937e-06,
      "loss": -0.026,
      "num_tokens": 285992778.0,
      "reward": 0.45018380880355835,
      "reward_std": 0.06126423552632332,
      "rewards/gemini_judge_reward_func/mean": 0.1227678582072258,
      "rewards/gemini_judge_reward_func/std": 0.24027155339717865,
      "rewards/semantic_correctness_reward_func/mean": 0.46813318133354187,
      "rewards/semantic_correctness_reward_func/std": 0.20166510343551636,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 802
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 465.0,
      "completions/mean_length": 179.36607360839844,
      "completions/mean_terminated_length": 152.11981201171875,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.2741667022320857,
      "grad_norm": 0.019164903089404106,
      "kl": 0.011428117752075195,
      "learning_rate": 1.2222366557537911e-06,
      "loss": -0.0085,
      "num_tokens": 286324600.0,
      "reward": 0.4557950496673584,
      "reward_std": 0.06462844461202621,
      "rewards/gemini_judge_reward_func/mean": 0.1506696492433548,
      "rewards/gemini_judge_reward_func/std": 0.27591997385025024,
      "rewards/semantic_correctness_reward_func/mean": 0.4761357307434082,
      "rewards/semantic_correctness_reward_func/std": 0.2194724828004837,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 803
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 922.0,
      "completions/mean_length": 163.40179443359375,
      "completions/mean_terminated_length": 159.5426025390625,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.27450813025479065,
      "grad_norm": 0.01984548196196556,
      "kl": 0.014154434204101562,
      "learning_rate": 1.2114256511983274e-06,
      "loss": 0.0074,
      "num_tokens": 286673866.0,
      "reward": 0.46019554138183594,
      "reward_std": 0.07883590459823608,
      "rewards/gemini_judge_reward_func/mean": 0.1529017835855484,
      "rewards/gemini_judge_reward_func/std": 0.2797389626502991,
      "rewards/semantic_correctness_reward_func/mean": 0.45792388916015625,
      "rewards/semantic_correctness_reward_func/std": 0.2234930694103241,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 804
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 183.72769165039062,
      "completions/mean_terminated_length": 156.6221160888672,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.27484955827749563,
      "grad_norm": 0.019319184124469757,
      "kl": 0.013891935348510742,
      "learning_rate": 1.200656077884958e-06,
      "loss": 0.0075,
      "num_tokens": 287028617.0,
      "reward": 0.4133339822292328,
      "reward_std": 0.0628940612077713,
      "rewards/gemini_judge_reward_func/mean": 0.1216517835855484,
      "rewards/gemini_judge_reward_func/std": 0.2571618854999542,
      "rewards/semantic_correctness_reward_func/mean": 0.45144638419151306,
      "rewards/semantic_correctness_reward_func/std": 0.19957341253757477,
      "rewards/xmlcount_reward_func/mean": 0.6859598159790039,
      "rewards/xmlcount_reward_func/std": 0.46228134632110596,
      "step": 805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 316.0,
      "completions/mean_length": 145.25894165039062,
      "completions/mean_terminated_length": 141.31838989257812,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.2751909863002006,
      "grad_norm": 0.020531706511974335,
      "kl": 0.013467550277709961,
      "learning_rate": 1.189928053588012e-06,
      "loss": 0.0089,
      "num_tokens": 287368943.0,
      "reward": 0.4259699881076813,
      "reward_std": 0.056370336562395096,
      "rewards/gemini_judge_reward_func/mean": 0.1160714253783226,
      "rewards/gemini_judge_reward_func/std": 0.23540008068084717,
      "rewards/semantic_correctness_reward_func/mean": 0.3827962875366211,
      "rewards/semantic_correctness_reward_func/std": 0.19811107218265533,
      "rewards/xmlcount_reward_func/mean": 0.7574554085731506,
      "rewards/xmlcount_reward_func/std": 0.4265342950820923,
      "step": 806
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 168.04019165039062,
      "completions/mean_terminated_length": 160.32882690429688,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.27553241432290554,
      "grad_norm": 0.020059313625097275,
      "kl": 0.012961864471435547,
      "learning_rate": 1.1792416956274443e-06,
      "loss": -0.0077,
      "num_tokens": 287728964.0,
      "reward": 0.4849930703639984,
      "reward_std": 0.08212324976921082,
      "rewards/gemini_judge_reward_func/mean": 0.1640625,
      "rewards/gemini_judge_reward_func/std": 0.27331385016441345,
      "rewards/semantic_correctness_reward_func/mean": 0.4470454156398773,
      "rewards/semantic_correctness_reward_func/std": 0.22025921940803528,
      "rewards/xmlcount_reward_func/mean": 0.8248973488807678,
      "rewards/xmlcount_reward_func/std": 0.3805646598339081,
      "step": 807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 988.0,
      "completions/mean_length": 168.46429443359375,
      "completions/mean_terminated_length": 144.91741943359375,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.2758738423456105,
      "grad_norm": 0.020307661965489388,
      "kl": 0.01799488067626953,
      "learning_rate": 1.1685971208675539e-06,
      "loss": 0.0114,
      "num_tokens": 288099604.0,
      "reward": 0.4186796247959137,
      "reward_std": 0.0665237084031105,
      "rewards/gemini_judge_reward_func/mean": 0.1629464328289032,
      "rewards/gemini_judge_reward_func/std": 0.30907392501831055,
      "rewards/semantic_correctness_reward_func/mean": 0.46446940302848816,
      "rewards/semantic_correctness_reward_func/std": 0.24958083033561707,
      "rewards/xmlcount_reward_func/mean": 0.6515178680419922,
      "rewards/xmlcount_reward_func/std": 0.4737243354320526,
      "step": 808
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 407.0,
      "completions/mean_length": 156.15625,
      "completions/mean_terminated_length": 144.3755645751953,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.2762152703683155,
      "grad_norm": 0.020918505266308784,
      "kl": 0.014392375946044922,
      "learning_rate": 1.157994445715706e-06,
      "loss": 0.0052,
      "num_tokens": 288450511.0,
      "reward": 0.42883527278900146,
      "reward_std": 0.07986614853143692,
      "rewards/gemini_judge_reward_func/mean": 0.1305803507566452,
      "rewards/gemini_judge_reward_func/std": 0.235503688454628,
      "rewards/semantic_correctness_reward_func/mean": 0.45301559567451477,
      "rewards/semantic_correctness_reward_func/std": 0.2121279239654541,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 809
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 886.0,
      "completions/mean_length": 154.13394165039062,
      "completions/mean_terminated_length": 150.23318481445312,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.27655669839102043,
      "grad_norm": 0.0214835312217474,
      "kl": 0.012638568878173828,
      "learning_rate": 1.1474337861210543e-06,
      "loss": -0.03,
      "num_tokens": 288758333.0,
      "reward": 0.4833209812641144,
      "reward_std": 0.06771397590637207,
      "rewards/gemini_judge_reward_func/mean": 0.1651785671710968,
      "rewards/gemini_judge_reward_func/std": 0.277225136756897,
      "rewards/semantic_correctness_reward_func/mean": 0.450685054063797,
      "rewards/semantic_correctness_reward_func/std": 0.23370078206062317,
      "rewards/xmlcount_reward_func/mean": 0.8177813291549683,
      "rewards/xmlcount_reward_func/std": 0.3879494369029999,
      "step": 810
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 884.0,
      "completions/mean_length": 149.8928680419922,
      "completions/mean_terminated_length": 142.0180206298828,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.2768981264137254,
      "grad_norm": 0.020677506923675537,
      "kl": 0.018234729766845703,
      "learning_rate": 1.1369152575732823e-06,
      "loss": -0.0149,
      "num_tokens": 289114269.0,
      "reward": 0.4415244460105896,
      "reward_std": 0.07626640051603317,
      "rewards/gemini_judge_reward_func/mean": 0.1651785671710968,
      "rewards/gemini_judge_reward_func/std": 0.26901575922966003,
      "rewards/semantic_correctness_reward_func/mean": 0.4562382698059082,
      "rewards/semantic_correctness_reward_func/std": 0.20611201226711273,
      "rewards/xmlcount_reward_func/mean": 0.7105134725570679,
      "rewards/xmlcount_reward_func/std": 0.45530465245246887,
      "step": 811
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 250.0,
      "completions/mean_length": 163.88394165039062,
      "completions/mean_terminated_length": 144.24656677246094,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.2772395544364304,
      "grad_norm": 0.02087881974875927,
      "kl": 0.014058828353881836,
      "learning_rate": 1.1264389751013326e-06,
      "loss": -0.0299,
      "num_tokens": 289461251.0,
      "reward": 0.4769635498523712,
      "reward_std": 0.0741821750998497,
      "rewards/gemini_judge_reward_func/mean": 0.1696428507566452,
      "rewards/gemini_judge_reward_func/std": 0.2775498926639557,
      "rewards/semantic_correctness_reward_func/mean": 0.472531795501709,
      "rewards/semantic_correctness_reward_func/std": 0.21735930442810059,
      "rewards/xmlcount_reward_func/mean": 0.7864999771118164,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 812
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 165.35269165039062,
      "completions/mean_terminated_length": 161.5022430419922,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.2775809824591353,
      "grad_norm": 0.02140544354915619,
      "kl": 0.015886783599853516,
      "learning_rate": 1.1160050532721527e-06,
      "loss": 0.0098,
      "num_tokens": 289816638.0,
      "reward": 0.437183141708374,
      "reward_std": 0.06502541899681091,
      "rewards/gemini_judge_reward_func/mean": 0.1328125,
      "rewards/gemini_judge_reward_func/std": 0.2581525146961212,
      "rewards/semantic_correctness_reward_func/mean": 0.4504155218601227,
      "rewards/semantic_correctness_reward_func/std": 0.20866143703460693,
      "rewards/xmlcount_reward_func/mean": 0.7349375486373901,
      "rewards/xmlcount_reward_func/std": 0.4362303912639618,
      "step": 813
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 987.0,
      "completions/mean_length": 174.36607360839844,
      "completions/mean_terminated_length": 158.91818237304688,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.2779224104818403,
      "grad_norm": 0.019738927483558655,
      "kl": 0.014048099517822266,
      "learning_rate": 1.1056136061894386e-06,
      "loss": -0.0082,
      "num_tokens": 290180528.0,
      "reward": 0.41765350103378296,
      "reward_std": 0.04742603749036789,
      "rewards/gemini_judge_reward_func/mean": 0.0970982164144516,
      "rewards/gemini_judge_reward_func/std": 0.21256086230278015,
      "rewards/semantic_correctness_reward_func/mean": 0.4283210337162018,
      "rewards/semantic_correctness_reward_func/std": 0.20828229188919067,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 814
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 782.0,
      "completions/mean_length": 158.6875,
      "completions/mean_terminated_length": 142.9545440673828,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.2782638385045453,
      "grad_norm": 0.021320754662156105,
      "kl": 0.019411802291870117,
      "learning_rate": 1.095264747492391e-06,
      "loss": -0.0259,
      "num_tokens": 290586834.0,
      "reward": 0.38686153292655945,
      "reward_std": 0.07450807094573975,
      "rewards/gemini_judge_reward_func/mean": 0.1540178507566452,
      "rewards/gemini_judge_reward_func/std": 0.2533310353755951,
      "rewards/semantic_correctness_reward_func/mean": 0.4465217590332031,
      "rewards/semantic_correctness_reward_func/std": 0.1979120969772339,
      "rewards/xmlcount_reward_func/mean": 0.5898750424385071,
      "rewards/xmlcount_reward_func/std": 0.4935583770275116,
      "step": 815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 871.0,
      "completions/mean_length": 187.71429443359375,
      "completions/mean_terminated_length": 156.74073791503906,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.2786052665272502,
      "grad_norm": 0.01968975178897381,
      "kl": 0.01842355728149414,
      "learning_rate": 1.0849585903544707e-06,
      "loss": 0.0035,
      "num_tokens": 290962134.0,
      "reward": 0.3895324170589447,
      "reward_std": 0.06561978906393051,
      "rewards/gemini_judge_reward_func/mean": 0.1261160671710968,
      "rewards/gemini_judge_reward_func/std": 0.23437467217445374,
      "rewards/semantic_correctness_reward_func/mean": 0.4061976969242096,
      "rewards/semantic_correctness_reward_func/std": 0.18816907703876495,
      "rewards/xmlcount_reward_func/mean": 0.6446161270141602,
      "rewards/xmlcount_reward_func/std": 0.4794987738132477,
      "step": 816
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 313.0,
      "completions/mean_length": 168.25,
      "completions/mean_terminated_length": 148.7123260498047,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.2789466945499552,
      "grad_norm": 0.02080092765390873,
      "kl": 0.013473033905029297,
      "learning_rate": 1.0746952474821615e-06,
      "loss": -0.0304,
      "num_tokens": 291327398.0,
      "reward": 0.43678271770477295,
      "reward_std": 0.07601462304592133,
      "rewards/gemini_judge_reward_func/mean": 0.1473214328289032,
      "rewards/gemini_judge_reward_func/std": 0.25067970156669617,
      "rewards/semantic_correctness_reward_func/mean": 0.4235205352306366,
      "rewards/semantic_correctness_reward_func/std": 0.24099822342395782,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 817
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 945.0,
      "completions/mean_length": 169.82144165039062,
      "completions/mean_terminated_length": 158.22625732421875,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.27928812257266017,
      "grad_norm": 0.020462460815906525,
      "kl": 0.014742612838745117,
      "learning_rate": 1.0644748311137377e-06,
      "loss": -0.0084,
      "num_tokens": 291660130.0,
      "reward": 0.46902474761009216,
      "reward_std": 0.06322862952947617,
      "rewards/gemini_judge_reward_func/mean": 0.1729910671710968,
      "rewards/gemini_judge_reward_func/std": 0.29747891426086426,
      "rewards/semantic_correctness_reward_func/mean": 0.4574181139469147,
      "rewards/semantic_correctness_reward_func/std": 0.2231023907661438,
      "rewards/xmlcount_reward_func/mean": 0.7708616852760315,
      "rewards/xmlcount_reward_func/std": 0.42080527544021606,
      "step": 818
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 618.0,
      "completions/mean_length": 170.60269165039062,
      "completions/mean_terminated_length": 155.08636474609375,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.2796295505953651,
      "grad_norm": 0.01898844726383686,
      "kl": 0.011361122131347656,
      "learning_rate": 1.0542974530180327e-06,
      "loss": -0.0136,
      "num_tokens": 291994705.0,
      "reward": 0.45434191823005676,
      "reward_std": 0.07022686302661896,
      "rewards/gemini_judge_reward_func/mean": 0.1573660671710968,
      "rewards/gemini_judge_reward_func/std": 0.2812555730342865,
      "rewards/semantic_correctness_reward_func/mean": 0.45547717809677124,
      "rewards/semantic_correctness_reward_func/std": 0.22370651364326477,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 819
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 759.0,
      "completions/mean_length": 162.15625,
      "completions/mean_terminated_length": 154.3918914794922,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.2799709786180701,
      "grad_norm": 0.01947224698960781,
      "kl": 0.011575698852539062,
      "learning_rate": 1.0441632244932238e-06,
      "loss": -0.0028,
      "num_tokens": 292289820.0,
      "reward": 0.4589068293571472,
      "reward_std": 0.05952470749616623,
      "rewards/gemini_judge_reward_func/mean": 0.09375,
      "rewards/gemini_judge_reward_func/std": 0.2160203754901886,
      "rewards/semantic_correctness_reward_func/mean": 0.3910340368747711,
      "rewards/semantic_correctness_reward_func/std": 0.21046021580696106,
      "rewards/xmlcount_reward_func/mean": 0.8579999804496765,
      "rewards/xmlcount_reward_func/std": 0.35106155276298523,
      "step": 820
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 434.0,
      "completions/mean_length": 156.95982360839844,
      "completions/mean_terminated_length": 137.1643829345703,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.28031240664077506,
      "grad_norm": 0.02017991617321968,
      "kl": 0.01740121841430664,
      "learning_rate": 1.0340722563656109e-06,
      "loss": -0.0061,
      "num_tokens": 292632279.0,
      "reward": 0.4119788706302643,
      "reward_std": 0.05955352261662483,
      "rewards/gemini_judge_reward_func/mean": 0.1149553582072258,
      "rewards/gemini_judge_reward_func/std": 0.24354811012744904,
      "rewards/semantic_correctness_reward_func/mean": 0.4357335567474365,
      "rewards/semantic_correctness_reward_func/std": 0.20957152545452118,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 821
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 349.0,
      "completions/mean_length": 153.30357360839844,
      "completions/mean_terminated_length": 145.45945739746094,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.28065383466348,
      "grad_norm": 0.020473873242735863,
      "kl": 0.013295650482177734,
      "learning_rate": 1.0240246589884046e-06,
      "loss": -0.0091,
      "num_tokens": 292982703.0,
      "reward": 0.44188037514686584,
      "reward_std": 0.04769199714064598,
      "rewards/gemini_judge_reward_func/mean": 0.1171875,
      "rewards/gemini_judge_reward_func/std": 0.22941070795059204,
      "rewards/semantic_correctness_reward_func/mean": 0.4377768635749817,
      "rewards/semantic_correctness_reward_func/std": 0.21695400774478912,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 822
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 164.54464721679688,
      "completions/mean_terminated_length": 148.91818237304688,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.28099526268618497,
      "grad_norm": 0.019360825419425964,
      "kl": 0.012386083602905273,
      "learning_rate": 1.0140205422405213e-06,
      "loss": 0.0227,
      "num_tokens": 293337929.0,
      "reward": 0.4652153551578522,
      "reward_std": 0.07463736832141876,
      "rewards/gemini_judge_reward_func/mean": 0.1953125,
      "rewards/gemini_judge_reward_func/std": 0.31242993474006653,
      "rewards/semantic_correctness_reward_func/mean": 0.46970170736312866,
      "rewards/semantic_correctness_reward_func/std": 0.24221506714820862,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 823
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 397.0,
      "completions/max_terminated_length": 397.0,
      "completions/mean_length": 144.9866180419922,
      "completions/mean_terminated_length": 144.9866180419922,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.28133669070888995,
      "grad_norm": 0.021534979343414307,
      "kl": 0.013988494873046875,
      "learning_rate": 1.0040600155253766e-06,
      "loss": 0.0079,
      "num_tokens": 293665650.0,
      "reward": 0.4723190665245056,
      "reward_std": 0.07654394209384918,
      "rewards/gemini_judge_reward_func/mean": 0.1651785671710968,
      "rewards/gemini_judge_reward_func/std": 0.27519577741622925,
      "rewards/semantic_correctness_reward_func/mean": 0.45823797583580017,
      "rewards/semantic_correctness_reward_func/std": 0.22768716514110565,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 824
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 957.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 161.45089721679688,
      "completions/mean_terminated_length": 161.45089721679688,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.2816781187315949,
      "grad_norm": 0.01974865049123764,
      "kl": 0.013636112213134766,
      "learning_rate": 9.941431877696955e-07,
      "loss": -0.0092,
      "num_tokens": 293982823.0,
      "reward": 0.44613802433013916,
      "reward_std": 0.0767451748251915,
      "rewards/gemini_judge_reward_func/mean": 0.1160714253783226,
      "rewards/gemini_judge_reward_func/std": 0.23058846592903137,
      "rewards/semantic_correctness_reward_func/mean": 0.4361810088157654,
      "rewards/semantic_correctness_reward_func/std": 0.19925597310066223,
      "rewards/xmlcount_reward_func/mean": 0.7811830639839172,
      "rewards/xmlcount_reward_func/std": 0.41245442628860474,
      "step": 825
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 679.0,
      "completions/mean_length": 171.91964721679688,
      "completions/mean_terminated_length": 148.46788024902344,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.28201954675429985,
      "grad_norm": 0.019685156643390656,
      "kl": 0.013269186019897461,
      "learning_rate": 9.842701674223187e-07,
      "loss": -0.0294,
      "num_tokens": 294355533.0,
      "reward": 0.41046252846717834,
      "reward_std": 0.05996851623058319,
      "rewards/gemini_judge_reward_func/mean": 0.1171875,
      "rewards/gemini_judge_reward_func/std": 0.2354399412870407,
      "rewards/semantic_correctness_reward_func/mean": 0.41921430826187134,
      "rewards/semantic_correctness_reward_func/std": 0.21170702576637268,
      "rewards/xmlcount_reward_func/mean": 0.6993616819381714,
      "rewards/xmlcount_reward_func/std": 0.4591045379638672,
      "step": 826
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 957.0,
      "completions/mean_length": 154.86607360839844,
      "completions/mean_terminated_length": 147.03604125976562,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.28236097477700484,
      "grad_norm": 0.020932121202349663,
      "kl": 0.015183448791503906,
      "learning_rate": 9.744410624530148e-07,
      "loss": -0.0218,
      "num_tokens": 294719891.0,
      "reward": 0.4267961084842682,
      "reward_std": 0.07028646767139435,
      "rewards/gemini_judge_reward_func/mean": 0.140625,
      "rewards/gemini_judge_reward_func/std": 0.2523016333580017,
      "rewards/semantic_correctness_reward_func/mean": 0.45848026871681213,
      "rewards/semantic_correctness_reward_func/std": 0.20311670005321503,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 827
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 307.0,
      "completions/mean_length": 168.8928680419922,
      "completions/mean_terminated_length": 145.3577880859375,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.28270240279970976,
      "grad_norm": 0.020107265561819077,
      "kl": 0.013638973236083984,
      "learning_rate": 9.646559803512995e-07,
      "loss": 0.0013,
      "num_tokens": 295080995.0,
      "reward": 0.44543105363845825,
      "reward_std": 0.06815174221992493,
      "rewards/gemini_judge_reward_func/mean": 0.1428571492433548,
      "rewards/gemini_judge_reward_func/std": 0.2476668804883957,
      "rewards/semantic_correctness_reward_func/mean": 0.43994078040122986,
      "rewards/semantic_correctness_reward_func/std": 0.21600624918937683,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 828
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 578.0,
      "completions/mean_length": 169.49107360839844,
      "completions/mean_terminated_length": 153.9545440673828,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.28304383082241474,
      "grad_norm": 0.019049208611249924,
      "kl": 0.015591621398925781,
      "learning_rate": 9.549150281252633e-07,
      "loss": 0.0164,
      "num_tokens": 295424597.0,
      "reward": 0.45486655831336975,
      "reward_std": 0.08084883540868759,
      "rewards/gemini_judge_reward_func/mean": 0.1450892835855484,
      "rewards/gemini_judge_reward_func/std": 0.2607302665710449,
      "rewards/semantic_correctness_reward_func/mean": 0.44690415263175964,
      "rewards/semantic_correctness_reward_func/std": 0.22606715559959412,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 829
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.0,
      "completions/mean_length": 173.16964721679688,
      "completions/mean_terminated_length": 157.6999969482422,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.2833852588451197,
      "grad_norm": 0.01913963072001934,
      "kl": 0.015146732330322266,
      "learning_rate": 9.452183123003999e-07,
      "loss": 0.0101,
      "num_tokens": 295787895.0,
      "reward": 0.45005136728286743,
      "reward_std": 0.06961911916732788,
      "rewards/gemini_judge_reward_func/mean": 0.1752232164144516,
      "rewards/gemini_judge_reward_func/std": 0.30639660358428955,
      "rewards/semantic_correctness_reward_func/mean": 0.43075647950172424,
      "rewards/semantic_correctness_reward_func/std": 0.2312474101781845,
      "rewards/xmlcount_reward_func/mean": 0.7345268130302429,
      "rewards/xmlcount_reward_func/std": 0.4422244727611542,
      "step": 830
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 433.0,
      "completions/mean_length": 183.26339721679688,
      "completions/mean_terminated_length": 156.14285278320312,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.28372668686782465,
      "grad_norm": 0.01923030987381935,
      "kl": 0.011376142501831055,
      "learning_rate": 9.355659389184396e-07,
      "loss": -0.0029,
      "num_tokens": 296141650.0,
      "reward": 0.4269668757915497,
      "reward_std": 0.06150417774915695,
      "rewards/gemini_judge_reward_func/mean": 0.1060267835855484,
      "rewards/gemini_judge_reward_func/std": 0.20415377616882324,
      "rewards/semantic_correctness_reward_func/mean": 0.4212806522846222,
      "rewards/semantic_correctness_reward_func/std": 0.19276918470859528,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 831
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 362.0,
      "completions/mean_length": 171.87054443359375,
      "completions/mean_terminated_length": 152.41551208496094,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.28406811489052963,
      "grad_norm": 0.020839476957917213,
      "kl": 0.012931585311889648,
      "learning_rate": 9.259580135361929e-07,
      "loss": 0.0077,
      "num_tokens": 296490013.0,
      "reward": 0.4934826195240021,
      "reward_std": 0.07880446314811707,
      "rewards/gemini_judge_reward_func/mean": 0.1395089328289032,
      "rewards/gemini_judge_reward_func/std": 0.25567853450775146,
      "rewards/semantic_correctness_reward_func/mean": 0.44558241963386536,
      "rewards/semantic_correctness_reward_func/std": 0.19386854767799377,
      "rewards/xmlcount_reward_func/mean": 0.8714062571525574,
      "rewards/xmlcount_reward_func/std": 0.3368014693260193,
      "step": 832
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 344.0,
      "completions/mean_length": 142.20089721679688,
      "completions/mean_terminated_length": 138.24664306640625,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.2844095429132346,
      "grad_norm": 0.021701615303754807,
      "kl": 0.01783442497253418,
      "learning_rate": 9.163946412243896e-07,
      "loss": -0.0085,
      "num_tokens": 296849162.0,
      "reward": 0.4567524492740631,
      "reward_std": 0.08476989716291428,
      "rewards/gemini_judge_reward_func/mean": 0.1618303507566452,
      "rewards/gemini_judge_reward_func/std": 0.29147952795028687,
      "rewards/semantic_correctness_reward_func/mean": 0.4586014151573181,
      "rewards/semantic_correctness_reward_func/std": 0.21884453296661377,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 456.0,
      "completions/mean_length": 161.02232360839844,
      "completions/mean_terminated_length": 153.2477569580078,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.28475097093593954,
      "grad_norm": 0.02000286616384983,
      "kl": 0.015344858169555664,
      "learning_rate": 9.068759265665384e-07,
      "loss": 0.0039,
      "num_tokens": 297207363.0,
      "reward": 0.45448118448257446,
      "reward_std": 0.05085495859384537,
      "rewards/gemini_judge_reward_func/mean": 0.0770089253783226,
      "rewards/gemini_judge_reward_func/std": 0.18472008407115936,
      "rewards/semantic_correctness_reward_func/mean": 0.4046199917793274,
      "rewards/semantic_correctness_reward_func/std": 0.20661711692810059,
      "rewards/xmlcount_reward_func/mean": 0.8568840026855469,
      "rewards/xmlcount_reward_func/std": 0.3510022759437561,
      "step": 834
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 996.0,
      "completions/mean_length": 164.46429443359375,
      "completions/mean_terminated_length": 148.83636474609375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.2850923989586445,
      "grad_norm": 0.020461585372686386,
      "kl": 0.013503074645996094,
      "learning_rate": 8.974019736577777e-07,
      "loss": -0.0198,
      "num_tokens": 297569715.0,
      "reward": 0.4008234739303589,
      "reward_std": 0.0664874017238617,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.21303418278694153,
      "rewards/semantic_correctness_reward_func/mean": 0.4402957856655121,
      "rewards/semantic_correctness_reward_func/std": 0.21150822937488556,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 835
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 557.0,
      "completions/mean_length": 163.07589721679688,
      "completions/mean_terminated_length": 143.4200897216797,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.2854338269813495,
      "grad_norm": 0.023171184584498405,
      "kl": 0.014976263046264648,
      "learning_rate": 8.879728861037385e-07,
      "loss": -0.012,
      "num_tokens": 297925140.0,
      "reward": 0.3934459984302521,
      "reward_std": 0.07260891795158386,
      "rewards/gemini_judge_reward_func/mean": 0.1116071417927742,
      "rewards/gemini_judge_reward_func/std": 0.2242058366537094,
      "rewards/semantic_correctness_reward_func/mean": 0.4212654232978821,
      "rewards/semantic_correctness_reward_func/std": 0.19929270446300507,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 836
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 716.0,
      "completions/mean_length": 174.25894165039062,
      "completions/mean_terminated_length": 158.80908203125,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.2857752550040545,
      "grad_norm": 0.01915401965379715,
      "kl": 0.014138221740722656,
      "learning_rate": 8.785887670194137e-07,
      "loss": 0.0121,
      "num_tokens": 298242194.0,
      "reward": 0.4870355427265167,
      "reward_std": 0.07406332343816757,
      "rewards/gemini_judge_reward_func/mean": 0.1439732164144516,
      "rewards/gemini_judge_reward_func/std": 0.25647982954978943,
      "rewards/semantic_correctness_reward_func/mean": 0.4312308430671692,
      "rewards/semantic_correctness_reward_func/std": 0.2138284593820572,
      "rewards/xmlcount_reward_func/mean": 0.8580000996589661,
      "rewards/xmlcount_reward_func/std": 0.35106155276298523,
      "step": 837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 599.0,
      "completions/max_terminated_length": 599.0,
      "completions/mean_length": 149.5,
      "completions/mean_terminated_length": 149.5,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.2861166830267594,
      "grad_norm": 0.020606782287359238,
      "kl": 0.014037609100341797,
      "learning_rate": 8.692497190280225e-07,
      "loss": -0.0364,
      "num_tokens": 298575282.0,
      "reward": 0.42692282795906067,
      "reward_std": 0.0603804774582386,
      "rewards/gemini_judge_reward_func/mean": 0.0904017835855484,
      "rewards/gemini_judge_reward_func/std": 0.2089034467935562,
      "rewards/semantic_correctness_reward_func/mean": 0.3808102607727051,
      "rewards/semantic_correctness_reward_func/std": 0.22411176562309265,
      "rewards/xmlcount_reward_func/mean": 0.786500096321106,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 838
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 163.34375,
      "completions/mean_terminated_length": 155.590087890625,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.2864581110494644,
      "grad_norm": 0.020060038194060326,
      "kl": 0.01865839958190918,
      "learning_rate": 8.599558442598998e-07,
      "loss": 0.0025,
      "num_tokens": 298949891.0,
      "reward": 0.41716110706329346,
      "reward_std": 0.06800390779972076,
      "rewards/gemini_judge_reward_func/mean": 0.1462053507566452,
      "rewards/gemini_judge_reward_func/std": 0.25848188996315,
      "rewards/semantic_correctness_reward_func/mean": 0.4352964758872986,
      "rewards/semantic_correctness_reward_func/std": 0.2010316401720047,
      "rewards/xmlcount_reward_func/mean": 0.6790491342544556,
      "rewards/xmlcount_reward_func/std": 0.46593108773231506,
      "step": 839
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 449.0,
      "completions/mean_length": 163.66964721679688,
      "completions/mean_terminated_length": 148.0272674560547,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.2867995390721694,
      "grad_norm": 0.023369010537862778,
      "kl": 0.015680789947509766,
      "learning_rate": 8.507072443513703e-07,
      "loss": -0.0068,
      "num_tokens": 299298193.0,
      "reward": 0.4394191801548004,
      "reward_std": 0.048864465206861496,
      "rewards/gemini_judge_reward_func/mean": 0.1071428582072258,
      "rewards/gemini_judge_reward_func/std": 0.23128196597099304,
      "rewards/semantic_correctness_reward_func/mean": 0.4455600380897522,
      "rewards/semantic_correctness_reward_func/std": 0.21113541722297668,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 840
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 990.0,
      "completions/mean_length": 164.74107360839844,
      "completions/mean_terminated_length": 160.8878936767578,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.2871409670948743,
      "grad_norm": 0.020501412451267242,
      "kl": 0.013971805572509766,
      "learning_rate": 8.415040204436426e-07,
      "loss": 0.0172,
      "num_tokens": 299640531.0,
      "reward": 0.44586053490638733,
      "reward_std": 0.059171684086322784,
      "rewards/gemini_judge_reward_func/mean": 0.0993303582072258,
      "rewards/gemini_judge_reward_func/std": 0.2047901600599289,
      "rewards/semantic_correctness_reward_func/mean": 0.4397847056388855,
      "rewards/semantic_correctness_reward_func/std": 0.20271818339824677,
      "rewards/xmlcount_reward_func/mean": 0.7954286336898804,
      "rewards/xmlcount_reward_func/std": 0.3996967077255249,
      "step": 841
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 468.0,
      "completions/mean_length": 164.39732360839844,
      "completions/mean_terminated_length": 156.6531524658203,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.2874823951175793,
      "grad_norm": 0.020911818370223045,
      "kl": 0.011243343353271484,
      "learning_rate": 8.323462731816962e-07,
      "loss": -0.0332,
      "num_tokens": 299943028.0,
      "reward": 0.46584782004356384,
      "reward_std": 0.06846728920936584,
      "rewards/gemini_judge_reward_func/mean": 0.1183035746216774,
      "rewards/gemini_judge_reward_func/std": 0.24595338106155396,
      "rewards/semantic_correctness_reward_func/mean": 0.41238173842430115,
      "rewards/semantic_correctness_reward_func/std": 0.2328694760799408,
      "rewards/xmlcount_reward_func/mean": 0.8401250839233398,
      "rewards/xmlcount_reward_func/std": 0.3684578835964203,
      "step": 842
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 352.0,
      "completions/mean_length": 163.625,
      "completions/mean_terminated_length": 151.94570922851562,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.28782382314028426,
      "grad_norm": 0.021682532504200935,
      "kl": 0.015304327011108398,
      "learning_rate": 8.232341027131885e-07,
      "loss": -0.0226,
      "num_tokens": 300254040.0,
      "reward": 0.46195971965789795,
      "reward_std": 0.049539245665073395,
      "rewards/gemini_judge_reward_func/mean": 0.1071428582072258,
      "rewards/gemini_judge_reward_func/std": 0.1915077120065689,
      "rewards/semantic_correctness_reward_func/mean": 0.4152628481388092,
      "rewards/semantic_correctness_reward_func/std": 0.20335595309734344,
      "rewards/xmlcount_reward_func/mean": 0.8401250839233398,
      "rewards/xmlcount_reward_func/std": 0.3684578835964203,
      "step": 843
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 932.0,
      "completions/mean_length": 183.6607208251953,
      "completions/mean_terminated_length": 160.5321044921875,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.2881652511629892,
      "grad_norm": 0.019986923784017563,
      "kl": 0.01100611686706543,
      "learning_rate": 8.141676086873574e-07,
      "loss": -0.0132,
      "num_tokens": 300626396.0,
      "reward": 0.43232661485671997,
      "reward_std": 0.058139532804489136,
      "rewards/gemini_judge_reward_func/mean": 0.0982142835855484,
      "rewards/gemini_judge_reward_func/std": 0.19909308850765228,
      "rewards/semantic_correctness_reward_func/mean": 0.3922043442726135,
      "rewards/semantic_correctness_reward_func/std": 0.19739094376564026,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 844
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 157.40625,
      "completions/mean_terminated_length": 141.64999389648438,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.28850667918569417,
      "grad_norm": 0.02015194483101368,
      "kl": 0.016405105590820312,
      "learning_rate": 8.051468902539272e-07,
      "loss": -0.0271,
      "num_tokens": 300980835.0,
      "reward": 0.4009341299533844,
      "reward_std": 0.06801611930131912,
      "rewards/gemini_judge_reward_func/mean": 0.1428571492433548,
      "rewards/gemini_judge_reward_func/std": 0.2476668804883957,
      "rewards/semantic_correctness_reward_func/mean": 0.4286705553531647,
      "rewards/semantic_correctness_reward_func/std": 0.22438682615756989,
      "rewards/xmlcount_reward_func/mean": 0.6451429128646851,
      "rewards/xmlcount_reward_func/std": 0.4791279733181,
      "step": 845
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 856.0,
      "completions/mean_length": 166.5982208251953,
      "completions/mean_terminated_length": 154.95928955078125,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.28884810720839915,
      "grad_norm": 0.01968112215399742,
      "kl": 0.013155221939086914,
      "learning_rate": 7.961720460620321e-07,
      "loss": -0.008,
      "num_tokens": 301317749.0,
      "reward": 0.47892674803733826,
      "reward_std": 0.06558345258235931,
      "rewards/gemini_judge_reward_func/mean": 0.1395089328289032,
      "rewards/gemini_judge_reward_func/std": 0.2433219850063324,
      "rewards/semantic_correctness_reward_func/mean": 0.47111573815345764,
      "rewards/semantic_correctness_reward_func/std": 0.19663625955581665,
      "rewards/xmlcount_reward_func/mean": 0.8222500681877136,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 846
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 547.0,
      "completions/mean_length": 177.89732360839844,
      "completions/mean_terminated_length": 150.6036834716797,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.2891895352311041,
      "grad_norm": 0.020076138898730278,
      "kl": 0.015607357025146484,
      "learning_rate": 7.872431742591268e-07,
      "loss": -0.033,
      "num_tokens": 301673162.0,
      "reward": 0.3954106271266937,
      "reward_std": 0.06636636704206467,
      "rewards/gemini_judge_reward_func/mean": 0.1060267835855484,
      "rewards/gemini_judge_reward_func/std": 0.24871297180652618,
      "rewards/semantic_correctness_reward_func/mean": 0.4311065077781677,
      "rewards/semantic_correctness_reward_func/std": 0.18301746249198914,
      "rewards/xmlcount_reward_func/mean": 0.6669464707374573,
      "rewards/xmlcount_reward_func/std": 0.4685778319835663,
      "step": 847
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 863.0,
      "completions/mean_length": 161.6607208251953,
      "completions/mean_terminated_length": 149.9547576904297,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.28953096325380906,
      "grad_norm": 0.02056868001818657,
      "kl": 0.01586151123046875,
      "learning_rate": 7.783603724899258e-07,
      "loss": 0.0144,
      "num_tokens": 302045542.0,
      "reward": 0.44777730107307434,
      "reward_std": 0.06656524538993835,
      "rewards/gemini_judge_reward_func/mean": 0.1618303507566452,
      "rewards/gemini_judge_reward_func/std": 0.2631829082965851,
      "rewards/semantic_correctness_reward_func/mean": 0.48522552847862244,
      "rewards/semantic_correctness_reward_func/std": 0.22041460871696472,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 848
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 842.0,
      "completions/mean_length": 160.20982360839844,
      "completions/mean_terminated_length": 148.4841766357422,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.28987239127651404,
      "grad_norm": 0.020083541050553322,
      "kl": 0.01417398452758789,
      "learning_rate": 7.695237378953224e-07,
      "loss": 0.0159,
      "num_tokens": 302391601.0,
      "reward": 0.4260123372077942,
      "reward_std": 0.06106026470661163,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.20637255907058716,
      "rewards/semantic_correctness_reward_func/mean": 0.4433649182319641,
      "rewards/semantic_correctness_reward_func/std": 0.1907728761434555,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 849
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 582.0,
      "completions/mean_length": 172.33929443359375,
      "completions/mean_terminated_length": 148.89907836914062,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.29021381929921897,
      "grad_norm": 0.019677992910146713,
      "kl": 0.014250516891479492,
      "learning_rate": 7.607333671113409e-07,
      "loss": 0.0118,
      "num_tokens": 302743377.0,
      "reward": 0.4396704435348511,
      "reward_std": 0.06315828114748001,
      "rewards/gemini_judge_reward_func/mean": 0.1696428507566452,
      "rewards/gemini_judge_reward_func/std": 0.292304664850235,
      "rewards/semantic_correctness_reward_func/mean": 0.46481624245643616,
      "rewards/semantic_correctness_reward_func/std": 0.1968490481376648,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 850
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 659.0,
      "completions/mean_length": 181.64732360839844,
      "completions/mean_terminated_length": 158.4633026123047,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.29055524732192395,
      "grad_norm": 0.0194232277572155,
      "kl": 0.012831687927246094,
      "learning_rate": 7.519893562680663e-07,
      "loss": 0.0072,
      "num_tokens": 303115110.0,
      "reward": 0.46728041768074036,
      "reward_std": 0.07786200195550919,
      "rewards/gemini_judge_reward_func/mean": 0.1607142835855484,
      "rewards/gemini_judge_reward_func/std": 0.2717183828353882,
      "rewards/semantic_correctness_reward_func/mean": 0.43975013494491577,
      "rewards/semantic_correctness_reward_func/std": 0.21617595851421356,
      "rewards/xmlcount_reward_func/mean": 0.7876116633415222,
      "rewards/xmlcount_reward_func/std": 0.4077901840209961,
      "step": 851
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 300.0,
      "completions/mean_length": 158.8482208251953,
      "completions/mean_terminated_length": 143.11817932128906,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.29089667534462893,
      "grad_norm": 0.01978623867034912,
      "kl": 0.012966394424438477,
      "learning_rate": 7.432918009885997e-07,
      "loss": 0.0013,
      "num_tokens": 303451772.0,
      "reward": 0.4542922377586365,
      "reward_std": 0.0721825510263443,
      "rewards/gemini_judge_reward_func/mean": 0.1540178507566452,
      "rewards/gemini_judge_reward_func/std": 0.2816223204135895,
      "rewards/semantic_correctness_reward_func/mean": 0.4619251787662506,
      "rewards/semantic_correctness_reward_func/std": 0.22607234120368958,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 852
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 578.0,
      "completions/mean_length": 166.27679443359375,
      "completions/mean_terminated_length": 154.63348388671875,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.29123810336733386,
      "grad_norm": 0.021079065278172493,
      "kl": 0.013202190399169922,
      "learning_rate": 7.346407963880137e-07,
      "loss": -0.0072,
      "num_tokens": 303776266.0,
      "reward": 0.45635682344436646,
      "reward_std": 0.06529436260461807,
      "rewards/gemini_judge_reward_func/mean": 0.1194196417927742,
      "rewards/gemini_judge_reward_func/std": 0.2169191539287567,
      "rewards/semantic_correctness_reward_func/mean": 0.43419477343559265,
      "rewards/semantic_correctness_reward_func/std": 0.19115614891052246,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 853
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 799.0,
      "completions/mean_length": 168.6919708251953,
      "completions/mean_terminated_length": 149.1643829345703,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.29157953139003884,
      "grad_norm": 0.019034838303923607,
      "kl": 0.01335906982421875,
      "learning_rate": 7.260364370723044e-07,
      "loss": 0.0358,
      "num_tokens": 304163093.0,
      "reward": 0.389647513628006,
      "reward_std": 0.06260879337787628,
      "rewards/gemini_judge_reward_func/mean": 0.1171875,
      "rewards/gemini_judge_reward_func/std": 0.24705736339092255,
      "rewards/semantic_correctness_reward_func/mean": 0.4223891794681549,
      "rewards/semantic_correctness_reward_func/std": 0.2090659886598587,
      "rewards/xmlcount_reward_func/mean": 0.6457366943359375,
      "rewards/xmlcount_reward_func/std": 0.4788653254508972,
      "step": 854
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 826.0,
      "completions/mean_length": 160.6294708251953,
      "completions/mean_terminated_length": 156.7578582763672,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.2919209594127438,
      "grad_norm": 0.020094826817512512,
      "kl": 0.012228012084960938,
      "learning_rate": 7.174788171373731e-07,
      "loss": -0.0181,
      "num_tokens": 304475666.0,
      "reward": 0.44959086179733276,
      "reward_std": 0.06874435395002365,
      "rewards/gemini_judge_reward_func/mean": 0.1138392835855484,
      "rewards/gemini_judge_reward_func/std": 0.2341102659702301,
      "rewards/semantic_correctness_reward_func/mean": 0.41152557730674744,
      "rewards/semantic_correctness_reward_func/std": 0.21029044687747955,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 302.0,
      "completions/mean_length": 156.6294708251953,
      "completions/mean_terminated_length": 140.8590850830078,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.29226238743544874,
      "grad_norm": 0.020419996231794357,
      "kl": 0.01521611213684082,
      "learning_rate": 7.089680301679752e-07,
      "loss": -0.0014,
      "num_tokens": 304836359.0,
      "reward": 0.4453238248825073,
      "reward_std": 0.061911653727293015,
      "rewards/gemini_judge_reward_func/mean": 0.1607142835855484,
      "rewards/gemini_judge_reward_func/std": 0.2622709274291992,
      "rewards/semantic_correctness_reward_func/mean": 0.43944045901298523,
      "rewards/semantic_correctness_reward_func/std": 0.21513807773590088,
      "rewards/xmlcount_reward_func/mean": 0.7328750491142273,
      "rewards/xmlcount_reward_func/std": 0.44427841901779175,
      "step": 856
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 314.0,
      "completions/mean_length": 159.61607360839844,
      "completions/mean_terminated_length": 143.89999389648438,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.2926038154581537,
      "grad_norm": 0.028872739523649216,
      "kl": 0.01840972900390625,
      "learning_rate": 7.005041692367154e-07,
      "loss": 0.0281,
      "num_tokens": 305184333.0,
      "reward": 0.4945422112941742,
      "reward_std": 0.08059635013341904,
      "rewards/gemini_judge_reward_func/mean": 0.1640625,
      "rewards/gemini_judge_reward_func/std": 0.27535709738731384,
      "rewards/semantic_correctness_reward_func/mean": 0.46433597803115845,
      "rewards/semantic_correctness_reward_func/std": 0.21889406442642212,
      "rewards/xmlcount_reward_func/mean": 0.8401250839233398,
      "rewards/xmlcount_reward_func/std": 0.3684578537940979,
      "step": 857
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1019.0,
      "completions/mean_length": 187.21876525878906,
      "completions/mean_terminated_length": 156.2268524169922,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.2929452434808587,
      "grad_norm": 0.020401984453201294,
      "kl": 0.012844085693359375,
      "learning_rate": 6.92087326903022e-07,
      "loss": 0.007,
      "num_tokens": 305534010.0,
      "reward": 0.4454714357852936,
      "reward_std": 0.06435173749923706,
      "rewards/gemini_judge_reward_func/mean": 0.140625,
      "rewards/gemini_judge_reward_func/std": 0.25888094305992126,
      "rewards/semantic_correctness_reward_func/mean": 0.48035693168640137,
      "rewards/semantic_correctness_reward_func/std": 0.20532502233982086,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 858
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 167.4375,
      "completions/mean_terminated_length": 155.80996704101562,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.29328667150356363,
      "grad_norm": 0.019876059144735336,
      "kl": 0.014787673950195312,
      "learning_rate": 6.837175952121305e-07,
      "loss": -0.0128,
      "num_tokens": 305878924.0,
      "reward": 0.42637744545936584,
      "reward_std": 0.06756948679685593,
      "rewards/gemini_judge_reward_func/mean": 0.1183035746216774,
      "rewards/gemini_judge_reward_func/std": 0.2269900143146515,
      "rewards/semantic_correctness_reward_func/mean": 0.4116818606853485,
      "rewards/semantic_correctness_reward_func/std": 0.18583944439888,
      "rewards/xmlcount_reward_func/mean": 0.7417991757392883,
      "rewards/xmlcount_reward_func/std": 0.4394584894180298,
      "step": 859
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 783.0,
      "completions/mean_length": 164.16519165039062,
      "completions/mean_terminated_length": 148.5318145751953,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.2936280995262686,
      "grad_norm": 0.019963495433330536,
      "kl": 0.014317989349365234,
      "learning_rate": 6.753950656940905e-07,
      "loss": -0.0166,
      "num_tokens": 306243169.0,
      "reward": 0.4247058033943176,
      "reward_std": 0.07365789264440536,
      "rewards/gemini_judge_reward_func/mean": 0.1261160671710968,
      "rewards/gemini_judge_reward_func/std": 0.2740635871887207,
      "rewards/semantic_correctness_reward_func/mean": 0.41675207018852234,
      "rewards/semantic_correctness_reward_func/std": 0.21209114789962769,
      "rewards/xmlcount_reward_func/mean": 0.7272723913192749,
      "rewards/xmlcount_reward_func/std": 0.44621461629867554,
      "step": 860
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 567.0,
      "completions/mean_length": 152.4553680419922,
      "completions/mean_terminated_length": 148.54708862304688,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.2939695275489736,
      "grad_norm": 0.018703632056713104,
      "kl": 0.01490640640258789,
      "learning_rate": 6.671198293627479e-07,
      "loss": -0.0362,
      "num_tokens": 306606443.0,
      "reward": 0.4184475243091583,
      "reward_std": 0.07583961635828018,
      "rewards/gemini_judge_reward_func/mean": 0.171875,
      "rewards/gemini_judge_reward_func/std": 0.2710452377796173,
      "rewards/semantic_correctness_reward_func/mean": 0.46148738265037537,
      "rewards/semantic_correctness_reward_func/std": 0.20480482280254364,
      "rewards/xmlcount_reward_func/mean": 0.643500030040741,
      "rewards/xmlcount_reward_func/std": 0.48071083426475525,
      "step": 861
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 169.95982360839844,
      "completions/mean_terminated_length": 154.4318084716797,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.2943109555716785,
      "grad_norm": 0.02081831730902195,
      "kl": 0.013816595077514648,
      "learning_rate": 6.58891976714764e-07,
      "loss": -0.0332,
      "num_tokens": 306977126.0,
      "reward": 0.428364098072052,
      "reward_std": 0.06174404174089432,
      "rewards/gemini_judge_reward_func/mean": 0.1049107164144516,
      "rewards/gemini_judge_reward_func/std": 0.21344491839408875,
      "rewards/semantic_correctness_reward_func/mean": 0.43049880862236023,
      "rewards/semantic_correctness_reward_func/std": 0.19661079347133636,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 862
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 837.0,
      "completions/mean_length": 176.09376525878906,
      "completions/mean_terminated_length": 156.7351531982422,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.2946523835943835,
      "grad_norm": 0.01923784427344799,
      "kl": 0.014006614685058594,
      "learning_rate": 6.507115977286144e-07,
      "loss": 0.0059,
      "num_tokens": 307345083.0,
      "reward": 0.4206071197986603,
      "reward_std": 0.060593608766794205,
      "rewards/gemini_judge_reward_func/mean": 0.1149553582072258,
      "rewards/gemini_judge_reward_func/std": 0.23416103422641754,
      "rewards/semantic_correctness_reward_func/mean": 0.40737470984458923,
      "rewards/semantic_correctness_reward_func/std": 0.19159357249736786,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 863
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 267.0,
      "completions/mean_length": 150.45982360839844,
      "completions/mean_terminated_length": 138.60182189941406,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.2949938116170885,
      "grad_norm": 0.02324904315173626,
      "kl": 0.01671123504638672,
      "learning_rate": 6.425787818636131e-07,
      "loss": 0.0013,
      "num_tokens": 307693414.0,
      "reward": 0.4323778748512268,
      "reward_std": 0.0673980787396431,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.2471712976694107,
      "rewards/semantic_correctness_reward_func/mean": 0.4416749179363251,
      "rewards/semantic_correctness_reward_func/std": 0.20484818518161774,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 864
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 567.0,
      "completions/mean_length": 157.45089721679688,
      "completions/mean_terminated_length": 145.6877899169922,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.2953352396397934,
      "grad_norm": 0.02026812545955181,
      "kl": 0.017922401428222656,
      "learning_rate": 6.34493618058935e-07,
      "loss": -0.0241,
      "num_tokens": 308038595.0,
      "reward": 0.40504324436187744,
      "reward_std": 0.0582125298678875,
      "rewards/gemini_judge_reward_func/mean": 0.0870535746216774,
      "rewards/gemini_judge_reward_func/std": 0.1900254338979721,
      "rewards/semantic_correctness_reward_func/mean": 0.43004655838012695,
      "rewards/semantic_correctness_reward_func/std": 0.18637752532958984,
      "rewards/xmlcount_reward_func/mean": 0.7105312943458557,
      "rewards/xmlcount_reward_func/std": 0.4553159773349762,
      "step": 865
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 965.0,
      "completions/mean_length": 151.52232360839844,
      "completions/mean_terminated_length": 135.65908813476562,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.2956766676624984,
      "grad_norm": 0.02160787023603916,
      "kl": 0.016959190368652344,
      "learning_rate": 6.264561947326331e-07,
      "loss": -0.0499,
      "num_tokens": 308428480.0,
      "reward": 0.38458970189094543,
      "reward_std": 0.061651427298784256,
      "rewards/gemini_judge_reward_func/mean": 0.1495535671710968,
      "rewards/gemini_judge_reward_func/std": 0.2810530662536621,
      "rewards/semantic_correctness_reward_func/mean": 0.44409117102622986,
      "rewards/semantic_correctness_reward_func/std": 0.2333114594221115,
      "rewards/xmlcount_reward_func/mean": 0.5898750424385071,
      "rewards/xmlcount_reward_func/std": 0.493558406829834,
      "step": 866
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 600.0,
      "completions/mean_length": 149.66964721679688,
      "completions/mean_terminated_length": 137.8009033203125,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.2960180956852034,
      "grad_norm": 0.022309862077236176,
      "kl": 0.020073890686035156,
      "learning_rate": 6.184665997806832e-07,
      "loss": 0.0107,
      "num_tokens": 308800722.0,
      "reward": 0.41277799010276794,
      "reward_std": 0.0709206610918045,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.2195136845111847,
      "rewards/semantic_correctness_reward_func/mean": 0.4352737367153168,
      "rewards/semantic_correctness_reward_func/std": 0.21140582859516144,
      "rewards/xmlcount_reward_func/mean": 0.6937723755836487,
      "rewards/xmlcount_reward_func/std": 0.46180078387260437,
      "step": 867
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 654.0,
      "completions/mean_length": 173.76339721679688,
      "completions/mean_terminated_length": 158.30453491210938,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.2963595237079083,
      "grad_norm": 0.021554123610258102,
      "kl": 0.012543916702270508,
      "learning_rate": 6.105249205760128e-07,
      "loss": 0.0002,
      "num_tokens": 309163189.0,
      "reward": 0.4783555567264557,
      "reward_std": 0.0800277516245842,
      "rewards/gemini_judge_reward_func/mean": 0.1618303507566452,
      "rewards/gemini_judge_reward_func/std": 0.2705346941947937,
      "rewards/semantic_correctness_reward_func/mean": 0.45045629143714905,
      "rewards/semantic_correctness_reward_func/std": 0.23706389963626862,
      "rewards/xmlcount_reward_func/mean": 0.8088303804397583,
      "rewards/xmlcount_reward_func/std": 0.39227935671806335,
      "step": 868
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 541.0,
      "completions/mean_length": 166.58929443359375,
      "completions/mean_terminated_length": 154.95022583007812,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.2967009517306133,
      "grad_norm": 0.019824448972940445,
      "kl": 0.017318248748779297,
      "learning_rate": 6.026312439675553e-07,
      "loss": -0.015,
      "num_tokens": 309522953.0,
      "reward": 0.47017693519592285,
      "reward_std": 0.08199930191040039,
      "rewards/gemini_judge_reward_func/mean": 0.1473214328289032,
      "rewards/gemini_judge_reward_func/std": 0.26269039511680603,
      "rewards/semantic_correctness_reward_func/mean": 0.44304510951042175,
      "rewards/semantic_correctness_reward_func/std": 0.22514069080352783,
      "rewards/xmlcount_reward_func/mean": 0.8065982460975647,
      "rewards/xmlcount_reward_func/std": 0.3925861418247223,
      "step": 869
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 785.0,
      "completions/mean_length": 162.86607360839844,
      "completions/mean_terminated_length": 151.17648315429688,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.29704237975331826,
      "grad_norm": 0.019513197243213654,
      "kl": 0.016964197158813477,
      "learning_rate": 5.947856562792926e-07,
      "loss": -0.0283,
      "num_tokens": 309917055.0,
      "reward": 0.4131915271282196,
      "reward_std": 0.058818139135837555,
      "rewards/gemini_judge_reward_func/mean": 0.1160714253783226,
      "rewards/gemini_judge_reward_func/std": 0.23658768832683563,
      "rewards/semantic_correctness_reward_func/mean": 0.4277253746986389,
      "rewards/semantic_correctness_reward_func/std": 0.21168087422847748,
      "rewards/xmlcount_reward_func/mean": 0.7030447721481323,
      "rewards/xmlcount_reward_func/std": 0.45517581701278687,
      "step": 870
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 586.0,
      "completions/mean_length": 163.6294708251953,
      "completions/mean_terminated_length": 151.95022583007812,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.2973838077760232,
      "grad_norm": 0.02505846694111824,
      "kl": 0.01614856719970703,
      "learning_rate": 5.869882433093154e-07,
      "loss": -0.0025,
      "num_tokens": 310251404.0,
      "reward": 0.469322144985199,
      "reward_std": 0.06780051440000534,
      "rewards/gemini_judge_reward_func/mean": 0.1674107164144516,
      "rewards/gemini_judge_reward_func/std": 0.28190651535987854,
      "rewards/semantic_correctness_reward_func/mean": 0.4745390713214874,
      "rewards/semantic_correctness_reward_func/std": 0.21615885198116302,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 871
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 629.0,
      "completions/mean_length": 163.66964721679688,
      "completions/mean_terminated_length": 144.0273895263672,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.29772523579872817,
      "grad_norm": 0.01972603239119053,
      "kl": 0.014445066452026367,
      "learning_rate": 5.79239090328883e-07,
      "loss": 0.0015,
      "num_tokens": 310608642.0,
      "reward": 0.44004756212234497,
      "reward_std": 0.062194082885980606,
      "rewards/gemini_judge_reward_func/mean": 0.1383928507566452,
      "rewards/gemini_judge_reward_func/std": 0.2837909758090973,
      "rewards/semantic_correctness_reward_func/mean": 0.45770174264907837,
      "rewards/semantic_correctness_reward_func/std": 0.23461376130580902,
      "rewards/xmlcount_reward_func/mean": 0.7328750491142273,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 872
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 417.0,
      "completions/max_terminated_length": 417.0,
      "completions/mean_length": 151.84375,
      "completions/mean_terminated_length": 151.84375,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.29806666382143315,
      "grad_norm": 0.020589305087924004,
      "kl": 0.012288570404052734,
      "learning_rate": 5.715382820814885e-07,
      "loss": -0.0382,
      "num_tokens": 310927507.0,
      "reward": 0.46137532591819763,
      "reward_std": 0.06986761838197708,
      "rewards/gemini_judge_reward_func/mean": 0.1082589253783226,
      "rewards/gemini_judge_reward_func/std": 0.21504218876361847,
      "rewards/semantic_correctness_reward_func/mean": 0.4101085662841797,
      "rewards/semantic_correctness_reward_func/std": 0.22632215917110443,
      "rewards/xmlcount_reward_func/mean": 0.8401250839233398,
      "rewards/xmlcount_reward_func/std": 0.3684578835964203,
      "step": 873
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 384.0,
      "completions/mean_length": 171.75001525878906,
      "completions/mean_terminated_length": 144.25807189941406,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.29840809184413813,
      "grad_norm": 0.018754450604319572,
      "kl": 0.014352798461914062,
      "learning_rate": 5.63885902781941e-07,
      "loss": -0.0132,
      "num_tokens": 311314675.0,
      "reward": 0.42426401376724243,
      "reward_std": 0.06676241010427475,
      "rewards/gemini_judge_reward_func/mean": 0.1160714253783226,
      "rewards/gemini_judge_reward_func/std": 0.24243849515914917,
      "rewards/semantic_correctness_reward_func/mean": 0.43821272253990173,
      "rewards/semantic_correctness_reward_func/std": 0.20891118049621582,
      "rewards/xmlcount_reward_func/mean": 0.725482165813446,
      "rewards/xmlcount_reward_func/std": 0.4461726248264313,
      "step": 874
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 825.0,
      "completions/max_terminated_length": 825.0,
      "completions/mean_length": 151.74554443359375,
      "completions/mean_terminated_length": 151.74554443359375,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.29874951986684306,
      "grad_norm": 0.021069129928946495,
      "kl": 0.01585555076599121,
      "learning_rate": 5.562820361154315e-07,
      "loss": 0.0016,
      "num_tokens": 311651818.0,
      "reward": 0.44300538301467896,
      "reward_std": 0.06342455744743347,
      "rewards/gemini_judge_reward_func/mean": 0.1216517835855484,
      "rewards/gemini_judge_reward_func/std": 0.23554620146751404,
      "rewards/semantic_correctness_reward_func/mean": 0.4344729781150818,
      "rewards/semantic_correctness_reward_func/std": 0.20860819518566132,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 875
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1020.0,
      "completions/mean_length": 179.93304443359375,
      "completions/mean_terminated_length": 148.67129516601562,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.29909094788954804,
      "grad_norm": 0.020150672644376755,
      "kl": 0.014079093933105469,
      "learning_rate": 5.487267652366291e-07,
      "loss": 0.0032,
      "num_tokens": 312022119.0,
      "reward": 0.3856308162212372,
      "reward_std": 0.05329553782939911,
      "rewards/gemini_judge_reward_func/mean": 0.0948660746216774,
      "rewards/gemini_judge_reward_func/std": 0.20417827367782593,
      "rewards/semantic_correctness_reward_func/mean": 0.4156718850135803,
      "rewards/semantic_correctness_reward_func/std": 0.19628028571605682,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 876
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 636.0,
      "completions/mean_length": 169.14732360839844,
      "completions/mean_terminated_length": 145.6192626953125,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.299432375912253,
      "grad_norm": 0.019851071760058403,
      "kl": 0.012566089630126953,
      "learning_rate": 5.412201727687644e-07,
      "loss": -0.0112,
      "num_tokens": 312423944.0,
      "reward": 0.39708369970321655,
      "reward_std": 0.07032705843448639,
      "rewards/gemini_judge_reward_func/mean": 0.140625,
      "rewards/gemini_judge_reward_func/std": 0.25888094305992126,
      "rewards/semantic_correctness_reward_func/mean": 0.4261060357093811,
      "rewards/semantic_correctness_reward_func/std": 0.19139282405376434,
      "rewards/xmlcount_reward_func/mean": 0.6390312910079956,
      "rewards/xmlcount_reward_func/std": 0.4820234477519989,
      "step": 877
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 915.0,
      "completions/max_terminated_length": 915.0,
      "completions/mean_length": 147.0044708251953,
      "completions/mean_terminated_length": 147.0044708251953,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.29977380393495795,
      "grad_norm": 0.01827196218073368,
      "kl": 0.014067649841308594,
      "learning_rate": 5.337623408027293e-07,
      "loss": -0.0067,
      "num_tokens": 312785133.0,
      "reward": 0.4566745162010193,
      "reward_std": 0.07833483070135117,
      "rewards/gemini_judge_reward_func/mean": 0.1841517835855484,
      "rewards/gemini_judge_reward_func/std": 0.31206128001213074,
      "rewards/semantic_correctness_reward_func/mean": 0.4940064251422882,
      "rewards/semantic_correctness_reward_func/std": 0.22719837725162506,
      "rewards/xmlcount_reward_func/mean": 0.7105312943458557,
      "rewards/xmlcount_reward_func/std": 0.4553159773349762,
      "step": 878
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 917.0,
      "completions/mean_length": 170.375,
      "completions/mean_terminated_length": 154.8545379638672,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.30011523195766293,
      "grad_norm": 0.019717033952474594,
      "kl": 0.013404130935668945,
      "learning_rate": 5.263533508961827e-07,
      "loss": -0.0193,
      "num_tokens": 313157273.0,
      "reward": 0.4236012399196625,
      "reward_std": 0.07436075061559677,
      "rewards/gemini_judge_reward_func/mean": 0.1104910746216774,
      "rewards/gemini_judge_reward_func/std": 0.23512084782123566,
      "rewards/semantic_correctness_reward_func/mean": 0.4133989214897156,
      "rewards/semantic_correctness_reward_func/std": 0.21528121829032898,
      "rewards/xmlcount_reward_func/mean": 0.7418125867843628,
      "rewards/xmlcount_reward_func/std": 0.4394664168357849,
      "step": 879
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 266.0,
      "completions/mean_length": 157.05357360839844,
      "completions/mean_terminated_length": 137.26026916503906,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.3004566599803679,
      "grad_norm": 0.020437972620129585,
      "kl": 0.017246723175048828,
      "learning_rate": 5.189932840726486e-07,
      "loss": -0.02,
      "num_tokens": 313499205.0,
      "reward": 0.43167340755462646,
      "reward_std": 0.06233254447579384,
      "rewards/gemini_judge_reward_func/mean": 0.1160714253783226,
      "rewards/gemini_judge_reward_func/std": 0.23540008068084717,
      "rewards/semantic_correctness_reward_func/mean": 0.433661550283432,
      "rewards/semantic_correctness_reward_func/std": 0.20578143000602722,
      "rewards/xmlcount_reward_func/mean": 0.7462812662124634,
      "rewards/xmlcount_reward_func/std": 0.4369716942310333,
      "step": 880
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 336.0,
      "completions/mean_length": 153.95982360839844,
      "completions/mean_terminated_length": 150.0583038330078,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.30079808800307284,
      "grad_norm": 0.020293962210416794,
      "kl": 0.010478973388671875,
      "learning_rate": 5.116822208206396e-07,
      "loss": -0.0048,
      "num_tokens": 313856116.0,
      "reward": 0.48650670051574707,
      "reward_std": 0.07343684136867523,
      "rewards/gemini_judge_reward_func/mean": 0.1540178507566452,
      "rewards/gemini_judge_reward_func/std": 0.26096048951148987,
      "rewards/semantic_correctness_reward_func/mean": 0.41867607831954956,
      "rewards/semantic_correctness_reward_func/std": 0.22737175226211548,
      "rewards/xmlcount_reward_func/mean": 0.8529108166694641,
      "rewards/xmlcount_reward_func/std": 0.3554360866546631,
      "step": 881
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 280.0,
      "completions/mean_length": 152.30357360839844,
      "completions/mean_terminated_length": 144.45045471191406,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.3011395160257778,
      "grad_norm": 0.021946420893073082,
      "kl": 0.011649847030639648,
      "learning_rate": 5.044202410927707e-07,
      "loss": -0.023,
      "num_tokens": 314198824.0,
      "reward": 0.5047093629837036,
      "reward_std": 0.06970416009426117,
      "rewards/gemini_judge_reward_func/mean": 0.2142857164144516,
      "rewards/gemini_judge_reward_func/std": 0.31650617718696594,
      "rewards/semantic_correctness_reward_func/mean": 0.4862251281738281,
      "rewards/semantic_correctness_reward_func/std": 0.24020838737487793,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 882
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 692.0,
      "completions/mean_length": 163.53125,
      "completions/mean_terminated_length": 151.85069274902344,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.3014809440484828,
      "grad_norm": 0.01995939202606678,
      "kl": 0.013335943222045898,
      "learning_rate": 4.972074243048896e-07,
      "loss": -0.0016,
      "num_tokens": 314553251.0,
      "reward": 0.4273732602596283,
      "reward_std": 0.05411674082279205,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.2527475357055664,
      "rewards/semantic_correctness_reward_func/mean": 0.39429473876953125,
      "rewards/semantic_correctness_reward_func/std": 0.22735366225242615,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 883
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 618.0,
      "completions/mean_length": 185.68304443359375,
      "completions/mean_terminated_length": 154.63426208496094,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.30182237207118773,
      "grad_norm": 0.018948886543512344,
      "kl": 0.013762235641479492,
      "learning_rate": 4.900438493352056e-07,
      "loss": 0.0087,
      "num_tokens": 314895628.0,
      "reward": 0.47533664107322693,
      "reward_std": 0.06594642996788025,
      "rewards/gemini_judge_reward_func/mean": 0.15625,
      "rewards/gemini_judge_reward_func/std": 0.2843547463417053,
      "rewards/semantic_correctness_reward_func/mean": 0.45543310046195984,
      "rewards/semantic_correctness_reward_func/std": 0.23065660893917084,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 884
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 294.0,
      "completions/mean_length": 162.8794708251953,
      "completions/mean_terminated_length": 147.22271728515625,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.3021638000938927,
      "grad_norm": 0.022125041112303734,
      "kl": 0.012331485748291016,
      "learning_rate": 4.829295945234258e-07,
      "loss": -0.0208,
      "num_tokens": 315235093.0,
      "reward": 0.45227399468421936,
      "reward_std": 0.07823660224676132,
      "rewards/gemini_judge_reward_func/mean": 0.1462053507566452,
      "rewards/gemini_judge_reward_func/std": 0.24740150570869446,
      "rewards/semantic_correctness_reward_func/mean": 0.45180743932724,
      "rewards/semantic_correctness_reward_func/std": 0.20693226158618927,
      "rewards/xmlcount_reward_func/mean": 0.7585759162902832,
      "rewards/xmlcount_reward_func/std": 0.42353785037994385,
      "step": 885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 946.0,
      "completions/mean_length": 180.0491180419922,
      "completions/mean_terminated_length": 152.82489013671875,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.3025052281165977,
      "grad_norm": 0.01876814290881157,
      "kl": 0.015291213989257812,
      "learning_rate": 4.758647376699033e-07,
      "loss": -0.0157,
      "num_tokens": 315648760.0,
      "reward": 0.37088969349861145,
      "reward_std": 0.06254278868436813,
      "rewards/gemini_judge_reward_func/mean": 0.1227678582072258,
      "rewards/gemini_judge_reward_func/std": 0.2414351999759674,
      "rewards/semantic_correctness_reward_func/mean": 0.4635911285877228,
      "rewards/semantic_correctness_reward_func/std": 0.21879442036151886,
      "rewards/xmlcount_reward_func/mean": 0.5726607441902161,
      "rewards/xmlcount_reward_func/std": 0.49444088339805603,
      "step": 886
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 769.0,
      "completions/mean_length": 158.46429443359375,
      "completions/mean_terminated_length": 150.6666717529297,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.3028466561393026,
      "grad_norm": 0.019580967724323273,
      "kl": 0.01501321792602539,
      "learning_rate": 4.6884935603477733e-07,
      "loss": -0.0064,
      "num_tokens": 316007392.0,
      "reward": 0.4427121579647064,
      "reward_std": 0.07204774022102356,
      "rewards/gemini_judge_reward_func/mean": 0.1607142835855484,
      "rewards/gemini_judge_reward_func/std": 0.28283706307411194,
      "rewards/semantic_correctness_reward_func/mean": 0.4430249035358429,
      "rewards/semantic_correctness_reward_func/std": 0.24078968167304993,
      "rewards/xmlcount_reward_func/mean": 0.7245535850524902,
      "rewards/xmlcount_reward_func/std": 0.44617968797683716,
      "step": 887
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 524.0,
      "completions/mean_length": 174.3794708251953,
      "completions/mean_terminated_length": 146.97235107421875,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.3031880841620076,
      "grad_norm": 0.020197952166199684,
      "kl": 0.01589512825012207,
      "learning_rate": 4.6188352633713964e-07,
      "loss": -0.005,
      "num_tokens": 316382981.0,
      "reward": 0.4027620255947113,
      "reward_std": 0.06222005560994148,
      "rewards/gemini_judge_reward_func/mean": 0.109375,
      "rewards/gemini_judge_reward_func/std": 0.23385359346866608,
      "rewards/semantic_correctness_reward_func/mean": 0.40059566497802734,
      "rewards/semantic_correctness_reward_func/std": 0.21668115258216858,
      "rewards/xmlcount_reward_func/mean": 0.697232186794281,
      "rewards/xmlcount_reward_func/std": 0.5156688094139099,
      "step": 888
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 170.54464721679688,
      "completions/mean_terminated_length": 147.05503845214844,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.3035295121847126,
      "grad_norm": 0.020838545635342598,
      "kl": 0.01421976089477539,
      "learning_rate": 4.549673247541875e-07,
      "loss": 0.0222,
      "num_tokens": 316754535.0,
      "reward": 0.4516361355781555,
      "reward_std": 0.060274988412857056,
      "rewards/gemini_judge_reward_func/mean": 0.1551339328289032,
      "rewards/gemini_judge_reward_func/std": 0.2723967432975769,
      "rewards/semantic_correctness_reward_func/mean": 0.4464126229286194,
      "rewards/semantic_correctness_reward_func/std": 0.22379936277866364,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 889
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 155.49554443359375,
      "completions/mean_terminated_length": 151.6009063720703,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.3038709402074175,
      "grad_norm": 0.022012127563357353,
      "kl": 0.013429880142211914,
      "learning_rate": 4.48100826920394e-07,
      "loss": -0.0292,
      "num_tokens": 317093814.0,
      "reward": 0.452115535736084,
      "reward_std": 0.07477650046348572,
      "rewards/gemini_judge_reward_func/mean": 0.1417410671710968,
      "rewards/gemini_judge_reward_func/std": 0.2396954447031021,
      "rewards/semantic_correctness_reward_func/mean": 0.43247920274734497,
      "rewards/semantic_correctness_reward_func/std": 0.19742132723331451,
      "rewards/xmlcount_reward_func/mean": 0.7723080515861511,
      "rewards/xmlcount_reward_func/std": 0.41869527101516724,
      "step": 890
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 978.0,
      "completions/mean_length": 171.50894165039062,
      "completions/mean_terminated_length": 152.045654296875,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.3042123682301225,
      "grad_norm": 0.023126568645238876,
      "kl": 0.01566934585571289,
      "learning_rate": 4.412841079266778e-07,
      "loss": -0.0598,
      "num_tokens": 317443944.0,
      "reward": 0.4357885718345642,
      "reward_std": 0.07174669206142426,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.23674629628658295,
      "rewards/semantic_correctness_reward_func/mean": 0.42297855019569397,
      "rewards/semantic_correctness_reward_func/std": 0.2082798182964325,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 891
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 437.0,
      "completions/mean_length": 153.9419708251953,
      "completions/mean_terminated_length": 142.1312255859375,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.30455379625282747,
      "grad_norm": 0.018571894615888596,
      "kl": 0.010985612869262695,
      "learning_rate": 4.345172423195865e-07,
      "loss": 0.0101,
      "num_tokens": 317798447.0,
      "reward": 0.41238030791282654,
      "reward_std": 0.061363838613033295,
      "rewards/gemini_judge_reward_func/mean": 0.1328125,
      "rewards/gemini_judge_reward_func/std": 0.23898446559906006,
      "rewards/semantic_correctness_reward_func/mean": 0.4377765357494354,
      "rewards/semantic_correctness_reward_func/std": 0.21453407406806946,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 892
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 743.0,
      "completions/mean_length": 154.39732360839844,
      "completions/mean_terminated_length": 142.5927734375,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.3048952242755324,
      "grad_norm": 0.020391054451465607,
      "kl": 0.017475605010986328,
      "learning_rate": 4.27800304100478e-07,
      "loss": 0.0157,
      "num_tokens": 318175080.0,
      "reward": 0.43031346797943115,
      "reward_std": 0.06380556523799896,
      "rewards/gemini_judge_reward_func/mean": 0.140625,
      "rewards/gemini_judge_reward_func/std": 0.2511883080005646,
      "rewards/semantic_correctness_reward_func/mean": 0.4581742286682129,
      "rewards/semantic_correctness_reward_func/std": 0.23177775740623474,
      "rewards/xmlcount_reward_func/mean": 0.7060714364051819,
      "rewards/xmlcount_reward_func/std": 0.4524170458316803,
      "step": 893
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 572.0,
      "completions/mean_length": 168.71875,
      "completions/mean_terminated_length": 153.16818237304688,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.3052366522982374,
      "grad_norm": 0.02003057673573494,
      "kl": 0.013791561126708984,
      "learning_rate": 4.211333667247125e-07,
      "loss": -0.0267,
      "num_tokens": 318535501.0,
      "reward": 0.44697102904319763,
      "reward_std": 0.06763234734535217,
      "rewards/gemini_judge_reward_func/mean": 0.1506696492433548,
      "rewards/gemini_judge_reward_func/std": 0.269756555557251,
      "rewards/semantic_correctness_reward_func/mean": 0.4677656590938568,
      "rewards/semantic_correctness_reward_func/std": 0.21983444690704346,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 894
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 751.0,
      "completions/mean_length": 156.6919708251953,
      "completions/mean_terminated_length": 144.91856384277344,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.30557808032094236,
      "grad_norm": 0.020025836303830147,
      "kl": 0.01305532455444336,
      "learning_rate": 4.1451650310085076e-07,
      "loss": -0.0072,
      "num_tokens": 318903816.0,
      "reward": 0.44203463196754456,
      "reward_std": 0.06274479627609253,
      "rewards/gemini_judge_reward_func/mean": 0.1216517835855484,
      "rewards/gemini_judge_reward_func/std": 0.2460220754146576,
      "rewards/semantic_correctness_reward_func/mean": 0.46536940336227417,
      "rewards/semantic_correctness_reward_func/std": 0.21820604801177979,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 895
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 636.0,
      "completions/mean_length": 164.19644165039062,
      "completions/mean_terminated_length": 152.52488708496094,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.3059195083436473,
      "grad_norm": 0.020183706656098366,
      "kl": 0.014451742172241211,
      "learning_rate": 4.079497855898501e-07,
      "loss": -0.0157,
      "num_tokens": 319248500.0,
      "reward": 0.4057226777076721,
      "reward_std": 0.05680568888783455,
      "rewards/gemini_judge_reward_func/mean": 0.0904017835855484,
      "rewards/gemini_judge_reward_func/std": 0.20755748450756073,
      "rewards/semantic_correctness_reward_func/mean": 0.4178095757961273,
      "rewards/semantic_correctness_reward_func/std": 0.18284156918525696,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 896
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 688.0,
      "completions/mean_length": 165.16519165039062,
      "completions/mean_terminated_length": 153.5067901611328,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.30626093636635227,
      "grad_norm": 0.0204419307410717,
      "kl": 0.013719558715820312,
      "learning_rate": 4.01433286004283e-07,
      "loss": -0.0104,
      "num_tokens": 319595289.0,
      "reward": 0.47336649894714355,
      "reward_std": 0.07261441648006439,
      "rewards/gemini_judge_reward_func/mean": 0.1261160671710968,
      "rewards/gemini_judge_reward_func/std": 0.23793506622314453,
      "rewards/semantic_correctness_reward_func/mean": 0.4623859226703644,
      "rewards/semantic_correctness_reward_func/std": 0.20957084000110626,
      "rewards/xmlcount_reward_func/mean": 0.8261072039604187,
      "rewards/xmlcount_reward_func/std": 0.37810245156288147,
      "step": 897
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 157.9107208251953,
      "completions/mean_terminated_length": 150.1081085205078,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.30660236438905725,
      "grad_norm": 0.021409234032034874,
      "kl": 0.012285232543945312,
      "learning_rate": 3.949670756075447e-07,
      "loss": -0.0011,
      "num_tokens": 319936469.0,
      "reward": 0.49861517548561096,
      "reward_std": 0.05451667681336403,
      "rewards/gemini_judge_reward_func/mean": 0.1473214328289032,
      "rewards/gemini_judge_reward_func/std": 0.21190352737903595,
      "rewards/semantic_correctness_reward_func/mean": 0.4489149749279022,
      "rewards/semantic_correctness_reward_func/std": 0.19751466810703278,
      "rewards/xmlcount_reward_func/mean": 0.8747590184211731,
      "rewards/xmlcount_reward_func/std": 0.3317887485027313,
      "step": 898
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1017.0,
      "completions/mean_length": 176.21429443359375,
      "completions/mean_terminated_length": 156.8584442138672,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.3069437924117622,
      "grad_norm": 0.020069124177098274,
      "kl": 0.012425422668457031,
      "learning_rate": 3.885512251130763e-07,
      "loss": -0.0324,
      "num_tokens": 320298373.0,
      "reward": 0.43729156255722046,
      "reward_std": 0.05971502512693405,
      "rewards/gemini_judge_reward_func/mean": 0.1138392835855484,
      "rewards/gemini_judge_reward_func/std": 0.19918103516101837,
      "rewards/semantic_correctness_reward_func/mean": 0.43493086099624634,
      "rewards/semantic_correctness_reward_func/std": 0.1836792379617691,
      "rewards/xmlcount_reward_func/mean": 0.7619242072105408,
      "rewards/xmlcount_reward_func/std": 0.4237978458404541,
      "step": 899
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 160.74554443359375,
      "completions/mean_terminated_length": 145.04998779296875,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.30728522043446715,
      "grad_norm": 0.02074616402387619,
      "kl": 0.015354156494140625,
      "learning_rate": 3.8218580468359136e-07,
      "loss": 0.0332,
      "num_tokens": 320666908.0,
      "reward": 0.40854790806770325,
      "reward_std": 0.061800092458724976,
      "rewards/gemini_judge_reward_func/mean": 0.1049107164144516,
      "rewards/gemini_judge_reward_func/std": 0.21475397050380707,
      "rewards/semantic_correctness_reward_func/mean": 0.43866798281669617,
      "rewards/semantic_correctness_reward_func/std": 0.2061757743358612,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 900
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 548.0,
      "completions/mean_length": 152.12054443359375,
      "completions/mean_terminated_length": 144.26576232910156,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.30762664845717214,
      "grad_norm": 0.021649349480867386,
      "kl": 0.014882326126098633,
      "learning_rate": 3.7587088393030604e-07,
      "loss": 0.0046,
      "num_tokens": 321007971.0,
      "reward": 0.4334132671356201,
      "reward_std": 0.06561165302991867,
      "rewards/gemini_judge_reward_func/mean": 0.09375,
      "rewards/gemini_judge_reward_func/std": 0.20809029042720795,
      "rewards/semantic_correctness_reward_func/mean": 0.4065660834312439,
      "rewards/semantic_correctness_reward_func/std": 0.18776313960552216,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 901
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 752.0,
      "completions/mean_length": 170.97769165039062,
      "completions/mean_terminated_length": 143.46083068847656,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.30796807647987706,
      "grad_norm": 0.021009381860494614,
      "kl": 0.014513492584228516,
      "learning_rate": 3.6960653191218333e-07,
      "loss": -0.0344,
      "num_tokens": 321386086.0,
      "reward": 0.377293199300766,
      "reward_std": 0.05772269144654274,
      "rewards/gemini_judge_reward_func/mean": 0.0904017835855484,
      "rewards/gemini_judge_reward_func/std": 0.1935839205980301,
      "rewards/semantic_correctness_reward_func/mean": 0.38291215896606445,
      "rewards/semantic_correctness_reward_func/std": 0.17295578122138977,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 902
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 177.37054443359375,
      "completions/mean_terminated_length": 150.05990600585938,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.30830950450258204,
      "grad_norm": 0.019631680101156235,
      "kl": 0.011731147766113281,
      "learning_rate": 3.6339281713517304e-07,
      "loss": -0.0512,
      "num_tokens": 321745689.0,
      "reward": 0.4260324239730835,
      "reward_std": 0.07654067873954773,
      "rewards/gemini_judge_reward_func/mean": 0.1238839253783226,
      "rewards/gemini_judge_reward_func/std": 0.24943633377552032,
      "rewards/semantic_correctness_reward_func/mean": 0.4166439175605774,
      "rewards/semantic_correctness_reward_func/std": 0.1954047828912735,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 903
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 745.0,
      "completions/mean_length": 170.52679443359375,
      "completions/mean_terminated_length": 151.0410919189453,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.308650932525287,
      "grad_norm": 0.02067430503666401,
      "kl": 0.012658357620239258,
      "learning_rate": 3.572298075514652e-07,
      "loss": 0.0104,
      "num_tokens": 322102323.0,
      "reward": 0.48799073696136475,
      "reward_std": 0.07766105234622955,
      "rewards/gemini_judge_reward_func/mean": 0.2243303507566452,
      "rewards/gemini_judge_reward_func/std": 0.32230180501937866,
      "rewards/semantic_correctness_reward_func/mean": 0.4897927939891815,
      "rewards/semantic_correctness_reward_func/std": 0.24560298025608063,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 904
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 336.0,
      "completions/mean_length": 150.11607360839844,
      "completions/mean_terminated_length": 138.25340270996094,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.30899236054799195,
      "grad_norm": 0.020890070125460625,
      "kl": 0.016417980194091797,
      "learning_rate": 3.511175705587433e-07,
      "loss": 0.0049,
      "num_tokens": 322448521.0,
      "reward": 0.45954275131225586,
      "reward_std": 0.061038125306367874,
      "rewards/gemini_judge_reward_func/mean": 0.1517857164144516,
      "rewards/gemini_judge_reward_func/std": 0.28382623195648193,
      "rewards/semantic_correctness_reward_func/mean": 0.4568919539451599,
      "rewards/semantic_correctness_reward_func/std": 0.22153709828853607,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 905
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 951.0,
      "completions/mean_length": 164.74554443359375,
      "completions/mean_terminated_length": 149.1227264404297,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.30933378857069693,
      "grad_norm": 0.021773984655737877,
      "kl": 0.015863895416259766,
      "learning_rate": 3.450561729994534e-07,
      "loss": -0.0183,
      "num_tokens": 322804444.0,
      "reward": 0.41062042117118835,
      "reward_std": 0.06523442268371582,
      "rewards/gemini_judge_reward_func/mean": 0.1026785746216774,
      "rewards/gemini_judge_reward_func/std": 0.2259845733642578,
      "rewards/semantic_correctness_reward_func/mean": 0.45349493622779846,
      "rewards/semantic_correctness_reward_func/std": 0.21488092839717865,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 906
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 404.0,
      "completions/mean_length": 158.84375,
      "completions/mean_terminated_length": 147.09954833984375,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.3096752165934019,
      "grad_norm": 0.020778290927410126,
      "kl": 0.015050888061523438,
      "learning_rate": 3.390456811600673e-07,
      "loss": -0.0159,
      "num_tokens": 323163273.0,
      "reward": 0.4392737150192261,
      "reward_std": 0.07165578007698059,
      "rewards/gemini_judge_reward_func/mean": 0.1149553582072258,
      "rewards/gemini_judge_reward_func/std": 0.232961043715477,
      "rewards/semantic_correctness_reward_func/mean": 0.4113505184650421,
      "rewards/semantic_correctness_reward_func/std": 0.19492052495479584,
      "rewards/xmlcount_reward_func/mean": 0.7775535583496094,
      "rewards/xmlcount_reward_func/std": 0.417745977640152,
      "step": 907
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 296.0,
      "completions/mean_length": 147.1919708251953,
      "completions/mean_terminated_length": 143.26010131835938,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.31001664461610684,
      "grad_norm": 0.020663078874349594,
      "kl": 0.01226496696472168,
      "learning_rate": 3.3308616077036113e-07,
      "loss": 0.0115,
      "num_tokens": 323509436.0,
      "reward": 0.4209424555301666,
      "reward_std": 0.0577840618789196,
      "rewards/gemini_judge_reward_func/mean": 0.109375,
      "rewards/gemini_judge_reward_func/std": 0.22154484689235687,
      "rewards/semantic_correctness_reward_func/mean": 0.4202120304107666,
      "rewards/semantic_correctness_reward_func/std": 0.20789223909378052,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 908
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 449.0,
      "completions/mean_length": 157.61607360839844,
      "completions/mean_terminated_length": 145.85520935058594,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.3103580726388118,
      "grad_norm": 0.02018766477704048,
      "kl": 0.012340068817138672,
      "learning_rate": 3.271776770026963e-07,
      "loss": -0.0238,
      "num_tokens": 323846074.0,
      "reward": 0.43793949484825134,
      "reward_std": 0.04598357900977135,
      "rewards/gemini_judge_reward_func/mean": 0.0970982164144516,
      "rewards/gemini_judge_reward_func/std": 0.20721961557865143,
      "rewards/semantic_correctness_reward_func/mean": 0.3867507874965668,
      "rewards/semantic_correctness_reward_func/std": 0.20682360231876373,
      "rewards/xmlcount_reward_func/mean": 0.8043751120567322,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 909
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 364.0,
      "completions/mean_length": 152.21429443359375,
      "completions/mean_terminated_length": 144.36036682128906,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.3106995006615168,
      "grad_norm": 0.019882354885339737,
      "kl": 0.012122154235839844,
      "learning_rate": 3.213202944713023e-07,
      "loss": 0.001,
      "num_tokens": 324217038.0,
      "reward": 0.43322232365608215,
      "reward_std": 0.05336981639266014,
      "rewards/gemini_judge_reward_func/mean": 0.1372767835855484,
      "rewards/gemini_judge_reward_func/std": 0.25469791889190674,
      "rewards/semantic_correctness_reward_func/mean": 0.4615578353404999,
      "rewards/semantic_correctness_reward_func/std": 0.2347070872783661,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 910
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 548.0,
      "completions/mean_length": 164.65179443359375,
      "completions/mean_terminated_length": 152.98643493652344,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.3110409286842218,
      "grad_norm": 0.01976594142615795,
      "kl": 0.011294364929199219,
      "learning_rate": 3.1551407723157734e-07,
      "loss": 0.0006,
      "num_tokens": 324552060.0,
      "reward": 0.4899020493030548,
      "reward_std": 0.06357000023126602,
      "rewards/gemini_judge_reward_func/mean": 0.1662946492433548,
      "rewards/gemini_judge_reward_func/std": 0.2800786793231964,
      "rewards/semantic_correctness_reward_func/mean": 0.481358140707016,
      "rewards/semantic_correctness_reward_func/std": 0.20851466059684753,
      "rewards/xmlcount_reward_func/mean": 0.8177813291549683,
      "rewards/xmlcount_reward_func/std": 0.3879494369029999,
      "step": 911
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 466.0,
      "completions/mean_length": 164.8125,
      "completions/mean_terminated_length": 153.14932250976562,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.3113823567069267,
      "grad_norm": 0.01866706646978855,
      "kl": 0.013501882553100586,
      "learning_rate": 3.0975908877938277e-07,
      "loss": 0.0139,
      "num_tokens": 324893338.0,
      "reward": 0.47727853059768677,
      "reward_std": 0.06310079991817474,
      "rewards/gemini_judge_reward_func/mean": 0.1763392835855484,
      "rewards/gemini_judge_reward_func/std": 0.2794100046157837,
      "rewards/semantic_correctness_reward_func/mean": 0.46071383357048035,
      "rewards/semantic_correctness_reward_func/std": 0.22256124019622803,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 912
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 347.0,
      "completions/mean_length": 172.96429443359375,
      "completions/mean_terminated_length": 149.54127502441406,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.3117237847296317,
      "grad_norm": 0.021747853606939316,
      "kl": 0.011886119842529297,
      "learning_rate": 3.040553920503503e-07,
      "loss": 0.0192,
      "num_tokens": 325232410.0,
      "reward": 0.449485719203949,
      "reward_std": 0.0661468431353569,
      "rewards/gemini_judge_reward_func/mean": 0.1171875,
      "rewards/gemini_judge_reward_func/std": 0.24131856858730316,
      "rewards/semantic_correctness_reward_func/mean": 0.4311159551143646,
      "rewards/semantic_correctness_reward_func/std": 0.2181245982646942,
      "rewards/xmlcount_reward_func/mean": 0.7909687757492065,
      "rewards/xmlcount_reward_func/std": 0.4050505757331848,
      "step": 913
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 596.0,
      "completions/mean_length": 173.61607360839844,
      "completions/mean_terminated_length": 150.21099853515625,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.3120652127523367,
      "grad_norm": 0.0281588863581419,
      "kl": 0.01603221893310547,
      "learning_rate": 2.984030494191942e-07,
      "loss": 0.0035,
      "num_tokens": 325576556.0,
      "reward": 0.47375205159187317,
      "reward_std": 0.06507349759340286,
      "rewards/gemini_judge_reward_func/mean": 0.1450892835855484,
      "rewards/gemini_judge_reward_func/std": 0.25965309143066406,
      "rewards/semantic_correctness_reward_func/mean": 0.46983152627944946,
      "rewards/semantic_correctness_reward_func/std": 0.21551068127155304,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 914
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 466.0,
      "completions/mean_length": 151.9866180419922,
      "completions/mean_terminated_length": 136.13182067871094,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.3124066407750416,
      "grad_norm": 0.023043038323521614,
      "kl": 0.019103288650512695,
      "learning_rate": 2.928021226990263e-07,
      "loss": -0.0209,
      "num_tokens": 325931713.0,
      "reward": 0.4459460973739624,
      "reward_std": 0.058497704565525055,
      "rewards/gemini_judge_reward_func/mean": 0.140625,
      "rewards/gemini_judge_reward_func/std": 0.26844772696495056,
      "rewards/semantic_correctness_reward_func/mean": 0.44698023796081543,
      "rewards/semantic_correctness_reward_func/std": 0.2093106359243393,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 915
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 491.0,
      "completions/mean_length": 148.45089721679688,
      "completions/mean_terminated_length": 144.52467346191406,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.3127480687977466,
      "grad_norm": 0.021820751950144768,
      "kl": 0.01890873908996582,
      "learning_rate": 2.8725267314068496e-07,
      "loss": -0.0219,
      "num_tokens": 326264278.0,
      "reward": 0.424966424703598,
      "reward_std": 0.04606298357248306,
      "rewards/gemini_judge_reward_func/mean": 0.078125,
      "rewards/gemini_judge_reward_func/std": 0.18801312148571014,
      "rewards/semantic_correctness_reward_func/mean": 0.43133196234703064,
      "rewards/semantic_correctness_reward_func/std": 0.19531051814556122,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 916
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 943.0,
      "completions/mean_length": 162.04019165039062,
      "completions/mean_terminated_length": 146.36817932128906,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.31308949682045156,
      "grad_norm": 0.020182453095912933,
      "kl": 0.01574563980102539,
      "learning_rate": 2.817547614320615e-07,
      "loss": 0.0028,
      "num_tokens": 326636615.0,
      "reward": 0.4213365912437439,
      "reward_std": 0.0604686439037323,
      "rewards/gemini_judge_reward_func/mean": 0.1629464328289032,
      "rewards/gemini_judge_reward_func/std": 0.2845219075679779,
      "rewards/semantic_correctness_reward_func/mean": 0.4580489695072174,
      "rewards/semantic_correctness_reward_func/std": 0.2246883362531662,
      "rewards/xmlcount_reward_func/mean": 0.661370575428009,
      "rewards/xmlcount_reward_func/std": 0.47499868273735046,
      "step": 917
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 897.0,
      "completions/mean_length": 164.82589721679688,
      "completions/mean_terminated_length": 149.2045440673828,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.3134309248431565,
      "grad_norm": 0.0189261082559824,
      "kl": 0.017313480377197266,
      "learning_rate": 2.763084476974376e-07,
      "loss": 0.0112,
      "num_tokens": 327019208.0,
      "reward": 0.4103511869907379,
      "reward_std": 0.07770156115293503,
      "rewards/gemini_judge_reward_func/mean": 0.1261160671710968,
      "rewards/gemini_judge_reward_func/std": 0.2615041136741638,
      "rewards/semantic_correctness_reward_func/mean": 0.44102367758750916,
      "rewards/semantic_correctness_reward_func/std": 0.22343264520168304,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 918
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 513.0,
      "completions/mean_length": 172.2544708251953,
      "completions/mean_terminated_length": 148.81192016601562,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.31377235286586147,
      "grad_norm": 0.021766338497400284,
      "kl": 0.014467239379882812,
      "learning_rate": 2.7091379149682683e-07,
      "loss": -0.033,
      "num_tokens": 327396877.0,
      "reward": 0.40158364176750183,
      "reward_std": 0.054425518959760666,
      "rewards/gemini_judge_reward_func/mean": 0.0881696417927742,
      "rewards/gemini_judge_reward_func/std": 0.18727631866931915,
      "rewards/semantic_correctness_reward_func/mean": 0.4373288154602051,
      "rewards/semantic_correctness_reward_func/std": 0.19881200790405273,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 919
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 281.0,
      "completions/mean_length": 161.6294708251953,
      "completions/mean_terminated_length": 149.92308044433594,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.31411378088856645,
      "grad_norm": 0.020311061292886734,
      "kl": 0.01554250717163086,
      "learning_rate": 2.655708518253258e-07,
      "loss": -0.0004,
      "num_tokens": 327761186.0,
      "reward": 0.46613025665283203,
      "reward_std": 0.06654452532529831,
      "rewards/gemini_judge_reward_func/mean": 0.1171875,
      "rewards/gemini_judge_reward_func/std": 0.23184122145175934,
      "rewards/semantic_correctness_reward_func/mean": 0.4249636232852936,
      "rewards/semantic_correctness_reward_func/std": 0.2204969823360443,
      "rewards/xmlcount_reward_func/mean": 0.8356562852859497,
      "rewards/xmlcount_reward_func/std": 0.3725454807281494,
      "step": 920
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 616.0,
      "completions/mean_length": 160.41964721679688,
      "completions/mean_terminated_length": 140.70318603515625,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.3144552089112714,
      "grad_norm": 0.019982021301984787,
      "kl": 0.0161285400390625,
      "learning_rate": 2.602796871124663e-07,
      "loss": -0.0086,
      "num_tokens": 328124864.0,
      "reward": 0.40349289774894714,
      "reward_std": 0.08313170075416565,
      "rewards/gemini_judge_reward_func/mean": 0.1439732164144516,
      "rewards/gemini_judge_reward_func/std": 0.2586561143398285,
      "rewards/semantic_correctness_reward_func/mean": 0.43135711550712585,
      "rewards/semantic_correctness_reward_func/std": 0.21311675012111664,
      "rewards/xmlcount_reward_func/mean": 0.6490803956985474,
      "rewards/xmlcount_reward_func/std": 0.47694966197013855,
      "step": 921
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 402.0,
      "completions/mean_length": 158.6607208251953,
      "completions/mean_terminated_length": 146.91403198242188,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.31479663693397636,
      "grad_norm": 0.01935208961367607,
      "kl": 0.01622152328491211,
      "learning_rate": 2.5504035522157853e-07,
      "loss": -0.0028,
      "num_tokens": 328461724.0,
      "reward": 0.4391644597053528,
      "reward_std": 0.0541006401181221,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.24600426852703094,
      "rewards/semantic_correctness_reward_func/mean": 0.417500764131546,
      "rewards/semantic_correctness_reward_func/std": 0.20058931410312653,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 922
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 278.0,
      "completions/mean_length": 154.89732360839844,
      "completions/mean_terminated_length": 147.06756591796875,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.31513806495668134,
      "grad_norm": 0.020716093480587006,
      "kl": 0.01778697967529297,
      "learning_rate": 2.4985291344915675e-07,
      "loss": 0.0092,
      "num_tokens": 328833597.0,
      "reward": 0.441595196723938,
      "reward_std": 0.06813618540763855,
      "rewards/gemini_judge_reward_func/mean": 0.1082589253783226,
      "rewards/gemini_judge_reward_func/std": 0.23377598822116852,
      "rewards/semantic_correctness_reward_func/mean": 0.4209490418434143,
      "rewards/semantic_correctness_reward_func/std": 0.21312111616134644,
      "rewards/xmlcount_reward_func/mean": 0.7852544784545898,
      "rewards/xmlcount_reward_func/std": 0.41142624616622925,
      "step": 923
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 302.0,
      "completions/mean_length": 157.35269165039062,
      "completions/mean_terminated_length": 141.59544372558594,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.31547949297938627,
      "grad_norm": 0.02100345492362976,
      "kl": 0.016232967376708984,
      "learning_rate": 2.447174185242324e-07,
      "loss": -0.0323,
      "num_tokens": 329175456.0,
      "reward": 0.4355000853538513,
      "reward_std": 0.06009732559323311,
      "rewards/gemini_judge_reward_func/mean": 0.1450892835855484,
      "rewards/gemini_judge_reward_func/std": 0.278405100107193,
      "rewards/semantic_correctness_reward_func/mean": 0.4573217034339905,
      "rewards/semantic_correctness_reward_func/std": 0.20487044751644135,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 827.0,
      "completions/mean_length": 159.29019165039062,
      "completions/mean_terminated_length": 147.5520477294922,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.31582092100209125,
      "grad_norm": 0.01990508660674095,
      "kl": 0.015584707260131836,
      "learning_rate": 2.3963392660775576e-07,
      "loss": 0.0066,
      "num_tokens": 329545373.0,
      "reward": 0.4258612096309662,
      "reward_std": 0.05928758531808853,
      "rewards/gemini_judge_reward_func/mean": 0.1595982164144516,
      "rewards/gemini_judge_reward_func/std": 0.3021363317966461,
      "rewards/semantic_correctness_reward_func/mean": 0.41585955023765564,
      "rewards/semantic_correctness_reward_func/std": 0.20218245685100555,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 925
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 526.0,
      "completions/mean_length": 171.37501525878906,
      "completions/mean_terminated_length": 143.8709716796875,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.31616234902479623,
      "grad_norm": 0.020098304376006126,
      "kl": 0.015714406967163086,
      "learning_rate": 2.3460249329197825e-07,
      "loss": 0.024,
      "num_tokens": 329909241.0,
      "reward": 0.4246380925178528,
      "reward_std": 0.0676727145910263,
      "rewards/gemini_judge_reward_func/mean": 0.1383928507566452,
      "rewards/gemini_judge_reward_func/std": 0.2524305284023285,
      "rewards/semantic_correctness_reward_func/mean": 0.4476812779903412,
      "rewards/semantic_correctness_reward_func/std": 0.21570633351802826,
      "rewards/xmlcount_reward_func/mean": 0.6993616819381714,
      "rewards/xmlcount_reward_func/std": 0.4566512405872345,
      "step": 926
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 292.0,
      "completions/mean_length": 168.4553680419922,
      "completions/mean_terminated_length": 144.9082489013672,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.31650377704750116,
      "grad_norm": 0.019722236320376396,
      "kl": 0.012249469757080078,
      "learning_rate": 2.296231735998511e-07,
      "loss": -0.02,
      "num_tokens": 330249435.0,
      "reward": 0.4518135190010071,
      "reward_std": 0.06725968420505524,
      "rewards/gemini_judge_reward_func/mean": 0.1484375,
      "rewards/gemini_judge_reward_func/std": 0.26996058225631714,
      "rewards/semantic_correctness_reward_func/mean": 0.424942284822464,
      "rewards/semantic_correctness_reward_func/std": 0.21814289689064026,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 927
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 571.0,
      "completions/mean_length": 158.0491180419922,
      "completions/mean_terminated_length": 150.2477569580078,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.31684520507020614,
      "grad_norm": 0.02072775363922119,
      "kl": 0.012631654739379883,
      "learning_rate": 2.2469602198441575e-07,
      "loss": 0.007,
      "num_tokens": 330590598.0,
      "reward": 0.4358016848564148,
      "reward_std": 0.06530667841434479,
      "rewards/gemini_judge_reward_func/mean": 0.1261160671710968,
      "rewards/gemini_judge_reward_func/std": 0.24027937650680542,
      "rewards/semantic_correctness_reward_func/mean": 0.3984636962413788,
      "rewards/semantic_correctness_reward_func/std": 0.20566481351852417,
      "rewards/xmlcount_reward_func/mean": 0.7641563415527344,
      "rewards/xmlcount_reward_func/std": 0.4263768792152405,
      "step": 928
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 686.0,
      "completions/mean_length": 169.29019165039062,
      "completions/mean_terminated_length": 149.7762451171875,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.3171866330929111,
      "grad_norm": 0.02100742794573307,
      "kl": 0.01338648796081543,
      "learning_rate": 2.198210923282118e-07,
      "loss": 0.0037,
      "num_tokens": 330960599.0,
      "reward": 0.46286967396736145,
      "reward_std": 0.07256618142127991,
      "rewards/gemini_judge_reward_func/mean": 0.1383928507566452,
      "rewards/gemini_judge_reward_func/std": 0.250200092792511,
      "rewards/semantic_correctness_reward_func/mean": 0.4306160509586334,
      "rewards/semantic_correctness_reward_func/std": 0.21207194030284882,
      "rewards/xmlcount_reward_func/mean": 0.803473174571991,
      "rewards/xmlcount_reward_func/std": 0.39635196328163147,
      "step": 929
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 491.0,
      "completions/mean_length": 169.4375,
      "completions/mean_terminated_length": 145.91741943359375,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.31752806111561604,
      "grad_norm": 0.019842946901917458,
      "kl": 0.013571023941040039,
      "learning_rate": 2.149984379426906e-07,
      "loss": -0.0122,
      "num_tokens": 331315505.0,
      "reward": 0.4548283517360687,
      "reward_std": 0.06730558723211288,
      "rewards/gemini_judge_reward_func/mean": 0.1439732164144516,
      "rewards/gemini_judge_reward_func/std": 0.2682309150695801,
      "rewards/semantic_correctness_reward_func/mean": 0.4489452540874481,
      "rewards/semantic_correctness_reward_func/std": 0.20901791751384735,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 930
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 328.0,
      "completions/mean_length": 156.74107360839844,
      "completions/mean_terminated_length": 144.9683380126953,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.317869489138321,
      "grad_norm": 0.019857613369822502,
      "kl": 0.014861583709716797,
      "learning_rate": 2.102281115676258e-07,
      "loss": -0.0096,
      "num_tokens": 331661999.0,
      "reward": 0.4678274691104889,
      "reward_std": 0.07853475958108902,
      "rewards/gemini_judge_reward_func/mean": 0.1529017835855484,
      "rewards/gemini_judge_reward_func/std": 0.2736615538597107,
      "rewards/semantic_correctness_reward_func/mean": 0.4782174527645111,
      "rewards/semantic_correctness_reward_func/std": 0.2053535431623459,
      "rewards/xmlcount_reward_func/mean": 0.7775580286979675,
      "rewards/xmlcount_reward_func/std": 0.41774842143058777,
      "step": 931
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 849.0,
      "completions/mean_length": 166.09375,
      "completions/mean_terminated_length": 150.49545288085938,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.318210917161026,
      "grad_norm": 0.019011061638593674,
      "kl": 0.012024164199829102,
      "learning_rate": 2.0551016537054492e-07,
      "loss": -0.0194,
      "num_tokens": 332014572.0,
      "reward": 0.4248262047767639,
      "reward_std": 0.06209308281540871,
      "rewards/gemini_judge_reward_func/mean": 0.1116071417927742,
      "rewards/gemini_judge_reward_func/std": 0.22295227646827698,
      "rewards/semantic_correctness_reward_func/mean": 0.41284507513046265,
      "rewards/semantic_correctness_reward_func/std": 0.1855388879776001,
      "rewards/xmlcount_reward_func/mean": 0.7440357208251953,
      "rewards/xmlcount_reward_func/std": 0.43177708983421326,
      "step": 932
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 871.0,
      "completions/mean_length": 165.08482360839844,
      "completions/mean_terminated_length": 149.46817016601562,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.31855234518373093,
      "grad_norm": 0.021508535370230675,
      "kl": 0.018635272979736328,
      "learning_rate": 2.008446509461498e-07,
      "loss": -0.015,
      "num_tokens": 332385227.0,
      "reward": 0.4379209876060486,
      "reward_std": 0.055039145052433014,
      "rewards/gemini_judge_reward_func/mean": 0.1383928507566452,
      "rewards/gemini_judge_reward_func/std": 0.22042377293109894,
      "rewards/semantic_correctness_reward_func/mean": 0.4470691978931427,
      "rewards/semantic_correctness_reward_func/std": 0.2077719122171402,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 933
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 917.0,
      "completions/mean_length": 151.96429443359375,
      "completions/mean_terminated_length": 148.05381774902344,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.3188937732064359,
      "grad_norm": 0.021709749475121498,
      "kl": 0.016178607940673828,
      "learning_rate": 1.962316193157593e-07,
      "loss": 0.0039,
      "num_tokens": 332752595.0,
      "reward": 0.44336017966270447,
      "reward_std": 0.07409544289112091,
      "rewards/gemini_judge_reward_func/mean": 0.1495535671710968,
      "rewards/gemini_judge_reward_func/std": 0.2739836871623993,
      "rewards/semantic_correctness_reward_func/mean": 0.436318576335907,
      "rewards/semantic_correctness_reward_func/std": 0.24862277507781982,
      "rewards/xmlcount_reward_func/mean": 0.7406874895095825,
      "rewards/xmlcount_reward_func/std": 0.4378414452075958,
      "step": 934
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 270.0,
      "completions/mean_length": 144.76339721679688,
      "completions/mean_terminated_length": 136.84234619140625,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.3192352012291409,
      "grad_norm": 0.02092774398624897,
      "kl": 0.015278339385986328,
      "learning_rate": 1.91671120926748e-07,
      "loss": -0.0262,
      "num_tokens": 333116046.0,
      "reward": 0.4193107783794403,
      "reward_std": 0.05722092092037201,
      "rewards/gemini_judge_reward_func/mean": 0.1294642835855484,
      "rewards/gemini_judge_reward_func/std": 0.24486233294010162,
      "rewards/semantic_correctness_reward_func/mean": 0.44337525963783264,
      "rewards/semantic_correctness_reward_func/std": 0.21482908725738525,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 935
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 318.0,
      "completions/mean_length": 162.9419708251953,
      "completions/mean_terminated_length": 151.25340270996094,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.3195766292518458,
      "grad_norm": 0.0197757575660944,
      "kl": 0.010863065719604492,
      "learning_rate": 1.871632056519962e-07,
      "loss": -0.0301,
      "num_tokens": 333433317.0,
      "reward": 0.49096450209617615,
      "reward_std": 0.07130220532417297,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.2295006811618805,
      "rewards/semantic_correctness_reward_func/mean": 0.4262508749961853,
      "rewards/semantic_correctness_reward_func/std": 0.22580935060977936,
      "rewards/xmlcount_reward_func/mean": 0.8937500715255737,
      "rewards/xmlcount_reward_func/std": 0.31029748916625977,
      "step": 936
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 487.0,
      "completions/mean_length": 154.36607360839844,
      "completions/mean_terminated_length": 142.56109619140625,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.3199180572745508,
      "grad_norm": 0.01896924152970314,
      "kl": 0.014213323593139648,
      "learning_rate": 1.8270792278934302e-07,
      "loss": 0.0075,
      "num_tokens": 333789967.0,
      "reward": 0.4371952414512634,
      "reward_std": 0.053081054240465164,
      "rewards/gemini_judge_reward_func/mean": 0.1462053507566452,
      "rewards/gemini_judge_reward_func/std": 0.26701533794403076,
      "rewards/semantic_correctness_reward_func/mean": 0.46356528997421265,
      "rewards/semantic_correctness_reward_func/std": 0.21834862232208252,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 937
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 301.0,
      "completions/mean_length": 163.6875,
      "completions/mean_terminated_length": 140.00917053222656,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.3202594852972558,
      "grad_norm": 0.021720534190535545,
      "kl": 0.012692689895629883,
      "learning_rate": 1.7830532106104747e-07,
      "loss": 0.0044,
      "num_tokens": 334150361.0,
      "reward": 0.4325564503669739,
      "reward_std": 0.0697154626250267,
      "rewards/gemini_judge_reward_func/mean": 0.1305803507566452,
      "rewards/gemini_judge_reward_func/std": 0.24824969470500946,
      "rewards/semantic_correctness_reward_func/mean": 0.435871422290802,
      "rewards/semantic_correctness_reward_func/std": 0.2043805867433548,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 938
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 563.0,
      "completions/mean_length": 159.69644165039062,
      "completions/mean_terminated_length": 143.9818115234375,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.3206009133199607,
      "grad_norm": 0.019939295947551727,
      "kl": 0.015135526657104492,
      "learning_rate": 1.7395544861325718e-07,
      "loss": 0.0161,
      "num_tokens": 334531525.0,
      "reward": 0.4231032729148865,
      "reward_std": 0.05319977179169655,
      "rewards/gemini_judge_reward_func/mean": 0.1272321492433548,
      "rewards/gemini_judge_reward_func/std": 0.24027156829833984,
      "rewards/semantic_correctness_reward_func/mean": 0.42881080508232117,
      "rewards/semantic_correctness_reward_func/std": 0.2193426787853241,
      "rewards/xmlcount_reward_func/mean": 0.7161206007003784,
      "rewards/xmlcount_reward_func/std": 0.4517506957054138,
      "step": 939
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 337.0,
      "completions/mean_length": 159.54464721679688,
      "completions/mean_terminated_length": 147.80996704101562,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.3209423413426657,
      "grad_norm": 0.020010868087410927,
      "kl": 0.012491226196289062,
      "learning_rate": 1.696583530154794e-07,
      "loss": 0.0005,
      "num_tokens": 334851831.0,
      "reward": 0.48991096019744873,
      "reward_std": 0.0713018923997879,
      "rewards/gemini_judge_reward_func/mean": 0.1428571492433548,
      "rewards/gemini_judge_reward_func/std": 0.23844198882579803,
      "rewards/semantic_correctness_reward_func/mean": 0.4210367202758789,
      "rewards/semantic_correctness_reward_func/std": 0.23552283644676208,
      "rewards/xmlcount_reward_func/mean": 0.8714017868041992,
      "rewards/xmlcount_reward_func/std": 0.3367997407913208,
      "step": 940
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 165.13839721679688,
      "completions/mean_terminated_length": 137.4331817626953,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.3212837693653707,
      "grad_norm": 0.02136445976793766,
      "kl": 0.017918109893798828,
      "learning_rate": 1.6541408126006464e-07,
      "loss": -0.0136,
      "num_tokens": 335240942.0,
      "reward": 0.36811578273773193,
      "reward_std": 0.06013471260666847,
      "rewards/gemini_judge_reward_func/mean": 0.1417410671710968,
      "rewards/gemini_judge_reward_func/std": 0.2652287185192108,
      "rewards/semantic_correctness_reward_func/mean": 0.444409042596817,
      "rewards/semantic_correctness_reward_func/std": 0.22162111103534698,
      "rewards/xmlcount_reward_func/mean": 0.5563437342643738,
      "rewards/xmlcount_reward_func/std": 0.49736082553863525,
      "step": 941
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 402.0,
      "completions/mean_length": 184.04019165039062,
      "completions/mean_terminated_length": 156.9447021484375,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.3216251973880756,
      "grad_norm": 0.01701374724507332,
      "kl": 0.012472152709960938,
      "learning_rate": 1.6122267976168783e-07,
      "loss": -0.0328,
      "num_tokens": 335593499.0,
      "reward": 0.425184041261673,
      "reward_std": 0.07142052799463272,
      "rewards/gemini_judge_reward_func/mean": 0.1484375,
      "rewards/gemini_judge_reward_func/std": 0.2593710422515869,
      "rewards/semantic_correctness_reward_func/mean": 0.4347950518131256,
      "rewards/semantic_correctness_reward_func/std": 0.2152114361524582,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 942
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 916.0,
      "completions/mean_length": 163.93304443359375,
      "completions/mean_terminated_length": 156.1846923828125,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.3219666254107806,
      "grad_norm": 0.020743004977703094,
      "kl": 0.013138771057128906,
      "learning_rate": 1.5708419435684463e-07,
      "loss": -0.05,
      "num_tokens": 335956992.0,
      "reward": 0.4332004487514496,
      "reward_std": 0.0775144100189209,
      "rewards/gemini_judge_reward_func/mean": 0.1283482164144516,
      "rewards/gemini_judge_reward_func/std": 0.25165361166000366,
      "rewards/semantic_correctness_reward_func/mean": 0.40780559182167053,
      "rewards/semantic_correctness_reward_func/std": 0.22031456232070923,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 943
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 864.0,
      "completions/mean_length": 175.87501525878906,
      "completions/mean_terminated_length": 148.51612854003906,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.32230805343348556,
      "grad_norm": 0.019393648952245712,
      "kl": 0.0162045955657959,
      "learning_rate": 1.5299867030334815e-07,
      "loss": -0.0326,
      "num_tokens": 336327828.0,
      "reward": 0.38535192608833313,
      "reward_std": 0.06319523602724075,
      "rewards/gemini_judge_reward_func/mean": 0.0982142835855484,
      "rewards/gemini_judge_reward_func/std": 0.20871469378471375,
      "rewards/semantic_correctness_reward_func/mean": 0.4343844950199127,
      "rewards/semantic_correctness_reward_func/std": 0.1936558037996292,
      "rewards/xmlcount_reward_func/mean": 0.6479731798171997,
      "rewards/xmlcount_reward_func/std": 0.47700217366218567,
      "step": 944
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 889.0,
      "completions/mean_length": 165.7366180419922,
      "completions/mean_terminated_length": 154.0859832763672,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.3226494814561905,
      "grad_norm": 0.019887909293174744,
      "kl": 0.014321565628051758,
      "learning_rate": 1.4896615227983468e-07,
      "loss": -0.0364,
      "num_tokens": 336685281.0,
      "reward": 0.4396510124206543,
      "reward_std": 0.05946972966194153,
      "rewards/gemini_judge_reward_func/mean": 0.1183035746216774,
      "rewards/gemini_judge_reward_func/std": 0.2142873853445053,
      "rewards/semantic_correctness_reward_func/mean": 0.42439767718315125,
      "rewards/semantic_correctness_reward_func/std": 0.2012360543012619,
      "rewards/xmlcount_reward_func/mean": 0.7686250805854797,
      "rewards/xmlcount_reward_func/std": 0.42356836795806885,
      "step": 945
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 307.0,
      "completions/mean_length": 173.7232208251953,
      "completions/mean_terminated_length": 154.3105010986328,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.32299090947889547,
      "grad_norm": 0.020988894626498222,
      "kl": 0.011559724807739258,
      "learning_rate": 1.4498668438527597e-07,
      "loss": 0.0017,
      "num_tokens": 337010607.0,
      "reward": 0.4864182770252228,
      "reward_std": 0.060431286692619324,
      "rewards/gemini_judge_reward_func/mean": 0.1573660671710968,
      "rewards/gemini_judge_reward_func/std": 0.26588836312294006,
      "rewards/semantic_correctness_reward_func/mean": 0.4371090531349182,
      "rewards/semantic_correctness_reward_func/std": 0.2134704738855362,
      "rewards/xmlcount_reward_func/mean": 0.8401250243186951,
      "rewards/xmlcount_reward_func/std": 0.3684578835964203,
      "step": 946
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 410.0,
      "completions/mean_length": 160.3794708251953,
      "completions/mean_terminated_length": 156.50672912597656,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.32333233750160045,
      "grad_norm": 0.02066732570528984,
      "kl": 0.013165950775146484,
      "learning_rate": 1.4106031013849498e-07,
      "loss": -0.0018,
      "num_tokens": 337352116.0,
      "reward": 0.485461950302124,
      "reward_std": 0.05010446533560753,
      "rewards/gemini_judge_reward_func/mean": 0.1183035746216774,
      "rewards/gemini_judge_reward_func/std": 0.2538047432899475,
      "rewards/semantic_correctness_reward_func/mean": 0.4389525055885315,
      "rewards/semantic_correctness_reward_func/std": 0.21956577897071838,
      "rewards/xmlcount_reward_func/mean": 0.8758750557899475,
      "rewards/xmlcount_reward_func/std": 0.33179107308387756,
      "step": 947
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 415.0,
      "completions/mean_length": 159.58482360839844,
      "completions/mean_terminated_length": 147.85069274902344,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.32367376552430543,
      "grad_norm": 0.020279573276638985,
      "kl": 0.015435457229614258,
      "learning_rate": 1.3718707247769137e-07,
      "loss": -0.0023,
      "num_tokens": 337685867.0,
      "reward": 0.4606628715991974,
      "reward_std": 0.06019989401102066,
      "rewards/gemini_judge_reward_func/mean": 0.1428571492433548,
      "rewards/gemini_judge_reward_func/std": 0.25873589515686035,
      "rewards/semantic_correctness_reward_func/mean": 0.4445998966693878,
      "rewards/semantic_correctness_reward_func/std": 0.19706618785858154,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 948
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.0,
      "completions/mean_length": 163.2544708251953,
      "completions/mean_terminated_length": 151.5701446533203,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.32401519354701036,
      "grad_norm": 0.01981574296951294,
      "kl": 0.013592720031738281,
      "learning_rate": 1.333670137599713e-07,
      "loss": -0.0039,
      "num_tokens": 338015252.0,
      "reward": 0.436404287815094,
      "reward_std": 0.06201518699526787,
      "rewards/gemini_judge_reward_func/mean": 0.1439732164144516,
      "rewards/gemini_judge_reward_func/std": 0.25207090377807617,
      "rewards/semantic_correctness_reward_func/mean": 0.4283246397972107,
      "rewards/semantic_correctness_reward_func/std": 0.2121453732252121,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 949
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 917.0,
      "completions/mean_length": 189.2544708251953,
      "completions/mean_terminated_length": 158.3379669189453,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.32435662156971534,
      "grad_norm": 0.01981574296951294,
      "kl": 0.01743006706237793,
      "learning_rate": 1.333670137599713e-07,
      "loss": 0.0092,
      "num_tokens": 338382561.0,
      "reward": 0.3984151780605316,
      "reward_std": 0.04683025926351547,
      "rewards/gemini_judge_reward_func/mean": 0.1026785746216774,
      "rewards/gemini_judge_reward_func/std": 0.19681765139102936,
      "rewards/semantic_correctness_reward_func/mean": 0.41927212476730347,
      "rewards/semantic_correctness_reward_func/std": 0.20039360225200653,
      "rewards/xmlcount_reward_func/mean": 0.6837233304977417,
      "rewards/xmlcount_reward_func/std": 0.4643874168395996,
      "step": 950
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 404.0,
      "completions/mean_length": 157.4375,
      "completions/mean_terminated_length": 145.67420959472656,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.3246980495924203,
      "grad_norm": 0.019179075956344604,
      "kl": 0.011970043182373047,
      "learning_rate": 1.2960017576088445e-07,
      "loss": 0.0004,
      "num_tokens": 338736863.0,
      "reward": 0.4437229633331299,
      "reward_std": 0.07157056778669357,
      "rewards/gemini_judge_reward_func/mean": 0.1595982164144516,
      "rewards/gemini_judge_reward_func/std": 0.2656058371067047,
      "rewards/semantic_correctness_reward_func/mean": 0.43366822600364685,
      "rewards/semantic_correctness_reward_func/std": 0.21503110229969025,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 951
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 174.68304443359375,
      "completions/mean_terminated_length": 151.3073272705078,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.32503947761512525,
      "grad_norm": 0.019320348277688026,
      "kl": 0.017798185348510742,
      "learning_rate": 1.2588659967396998e-07,
      "loss": -0.0043,
      "num_tokens": 339097528.0,
      "reward": 0.4750409424304962,
      "reward_std": 0.08295747637748718,
      "rewards/gemini_judge_reward_func/mean": 0.1796875,
      "rewards/gemini_judge_reward_func/std": 0.29441800713539124,
      "rewards/semantic_correctness_reward_func/mean": 0.478579580783844,
      "rewards/semantic_correctness_reward_func/std": 0.2249763160943985,
      "rewards/xmlcount_reward_func/mean": 0.7686249613761902,
      "rewards/xmlcount_reward_func/std": 0.42356839776039124,
      "step": 952
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 550.0,
      "completions/mean_length": 168.25894165039062,
      "completions/mean_terminated_length": 148.72145080566406,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.32538090563783023,
      "grad_norm": 0.0192513857036829,
      "kl": 0.014150142669677734,
      "learning_rate": 1.222263261102985e-07,
      "loss": -0.0178,
      "num_tokens": 339463978.0,
      "reward": 0.43717771768569946,
      "reward_std": 0.06090007722377777,
      "rewards/gemini_judge_reward_func/mean": 0.1294642835855484,
      "rewards/gemini_judge_reward_func/std": 0.23313555121421814,
      "rewards/semantic_correctness_reward_func/mean": 0.4477901756763458,
      "rewards/semantic_correctness_reward_func/std": 0.18037478625774384,
      "rewards/xmlcount_reward_func/mean": 0.7395849227905273,
      "rewards/xmlcount_reward_func/std": 0.4368475377559662,
      "step": 953
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 556.0,
      "completions/mean_length": 183.24554443359375,
      "completions/mean_terminated_length": 152.10647583007812,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.3257223336605352,
      "grad_norm": 0.019445307552814484,
      "kl": 0.013372421264648438,
      "learning_rate": 1.1861939509803688e-07,
      "loss": -0.0261,
      "num_tokens": 339827497.0,
      "reward": 0.4099394977092743,
      "reward_std": 0.05597153678536415,
      "rewards/gemini_judge_reward_func/mean": 0.0870535746216774,
      "rewards/gemini_judge_reward_func/std": 0.1914946585893631,
      "rewards/semantic_correctness_reward_func/mean": 0.4098401665687561,
      "rewards/semantic_correctness_reward_func/std": 0.17344339191913605,
      "rewards/xmlcount_reward_func/mean": 0.7328750491142273,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 954
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 429.0,
      "completions/mean_length": 165.41964721679688,
      "completions/mean_terminated_length": 141.7889862060547,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.32606376168324014,
      "grad_norm": 0.01958622597157955,
      "kl": 0.012957572937011719,
      "learning_rate": 1.1506584608200366e-07,
      "loss": 0.0071,
      "num_tokens": 340206199.0,
      "reward": 0.4157373011112213,
      "reward_std": 0.062380947172641754,
      "rewards/gemini_judge_reward_func/mean": 0.1316964328289032,
      "rewards/gemini_judge_reward_func/std": 0.2482219636440277,
      "rewards/semantic_correctness_reward_func/mean": 0.42104366421699524,
      "rewards/semantic_correctness_reward_func/std": 0.193641796708107,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 955
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 529.0,
      "completions/max_terminated_length": 529.0,
      "completions/mean_length": 153.5357208251953,
      "completions/mean_terminated_length": 153.5357208251953,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.3264051897059451,
      "grad_norm": 0.0198379959911108,
      "kl": 0.014824390411376953,
      "learning_rate": 1.1156571792324212e-07,
      "loss": -0.0188,
      "num_tokens": 340538411.0,
      "reward": 0.4334469139575958,
      "reward_std": 0.06941147148609161,
      "rewards/gemini_judge_reward_func/mean": 0.1183035746216774,
      "rewards/gemini_judge_reward_func/std": 0.2294461578130722,
      "rewards/semantic_correctness_reward_func/mean": 0.4291272759437561,
      "rewards/semantic_correctness_reward_func/std": 0.22070704400539398,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 956
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 702.0,
      "completions/mean_length": 144.6116180419922,
      "completions/mean_terminated_length": 140.6681671142578,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.3267466177286501,
      "grad_norm": 0.020753346383571625,
      "kl": 0.015350341796875,
      "learning_rate": 1.0811904889859337e-07,
      "loss": 0.0,
      "num_tokens": 340899920.0,
      "reward": 0.44371187686920166,
      "reward_std": 0.0667320117354393,
      "rewards/gemini_judge_reward_func/mean": 0.1540178507566452,
      "rewards/gemini_judge_reward_func/std": 0.2894744277000427,
      "rewards/semantic_correctness_reward_func/mean": 0.4805235266685486,
      "rewards/semantic_correctness_reward_func/std": 0.20522719621658325,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 957
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1016.0,
      "completions/mean_length": 160.96429443359375,
      "completions/mean_terminated_length": 149.24887084960938,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.327088045751355,
      "grad_norm": 0.02153160236775875,
      "kl": 0.020434141159057617,
      "learning_rate": 1.0472587670027678e-07,
      "loss": 0.0197,
      "num_tokens": 341271724.0,
      "reward": 0.44719889760017395,
      "reward_std": 0.08111313730478287,
      "rewards/gemini_judge_reward_func/mean": 0.1629464328289032,
      "rewards/gemini_judge_reward_func/std": 0.2903720736503601,
      "rewards/semantic_correctness_reward_func/mean": 0.46769067645072937,
      "rewards/semantic_correctness_reward_func/std": 0.23603184521198273,
      "rewards/xmlcount_reward_func/mean": 0.7212054133415222,
      "rewards/xmlcount_reward_func/std": 0.4433631896972656,
      "step": 958
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 605.0,
      "completions/mean_length": 162.07144165039062,
      "completions/mean_terminated_length": 154.30630493164062,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.32742947377406,
      "grad_norm": 0.01959027163684368,
      "kl": 0.011683225631713867,
      "learning_rate": 1.0138623843548078e-07,
      "loss": -0.0229,
      "num_tokens": 341623200.0,
      "reward": 0.47252076864242554,
      "reward_std": 0.07059833407402039,
      "rewards/gemini_judge_reward_func/mean": 0.1395089328289032,
      "rewards/gemini_judge_reward_func/std": 0.250137597322464,
      "rewards/semantic_correctness_reward_func/mean": 0.44802334904670715,
      "rewards/semantic_correctness_reward_func/std": 0.1806076020002365,
      "rewards/xmlcount_reward_func/mean": 0.8177813291549683,
      "rewards/xmlcount_reward_func/std": 0.3879494369029999,
      "step": 959
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 147.3303680419922,
      "completions/mean_terminated_length": 139.43243408203125,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.327770901796765,
      "grad_norm": 0.022266387939453125,
      "kl": 0.018918991088867188,
      "learning_rate": 9.810017062595322e-08,
      "loss": -0.013,
      "num_tokens": 341993126.0,
      "reward": 0.4521217942237854,
      "reward_std": 0.07054702937602997,
      "rewards/gemini_judge_reward_func/mean": 0.1863839328289032,
      "rewards/gemini_judge_reward_func/std": 0.288268119096756,
      "rewards/semantic_correctness_reward_func/mean": 0.4935908019542694,
      "rewards/semantic_correctness_reward_func/std": 0.23850572109222412,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 960
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 810.0,
      "completions/mean_length": 161.08482360839844,
      "completions/mean_terminated_length": 145.39544677734375,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.3281123298194699,
      "grad_norm": 0.020122205838561058,
      "kl": 0.014005899429321289,
      "learning_rate": 9.486770920760668e-08,
      "loss": -0.015,
      "num_tokens": 342363049.0,
      "reward": 0.4218449890613556,
      "reward_std": 0.06150934845209122,
      "rewards/gemini_judge_reward_func/mean": 0.1372767835855484,
      "rewards/gemini_judge_reward_func/std": 0.2798641622066498,
      "rewards/semantic_correctness_reward_func/mean": 0.43818897008895874,
      "rewards/semantic_correctness_reward_func/std": 0.22415503859519958,
      "rewards/xmlcount_reward_func/mean": 0.698241114616394,
      "rewards/xmlcount_reward_func/std": 0.4598964750766754,
      "step": 961
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1013.0,
      "completions/mean_length": 181.9241180419922,
      "completions/mean_terminated_length": 150.73611450195312,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.3284537578421749,
      "grad_norm": 0.026615537703037262,
      "kl": 0.02645087242126465,
      "learning_rate": 9.16888895301199e-08,
      "loss": 0.0051,
      "num_tokens": 342726040.0,
      "reward": 0.43222251534461975,
      "reward_std": 0.07855530083179474,
      "rewards/gemini_judge_reward_func/mean": 0.1964285671710968,
      "rewards/gemini_judge_reward_func/std": 0.29840490221977234,
      "rewards/semantic_correctness_reward_func/mean": 0.4771032929420471,
      "rewards/semantic_correctness_reward_func/std": 0.2208947241306305,
      "rewards/xmlcount_reward_func/mean": 0.6455759406089783,
      "rewards/xmlcount_reward_func/std": 0.4838610589504242,
      "step": 962
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 461.0,
      "completions/mean_length": 162.92857360839844,
      "completions/mean_terminated_length": 147.27272033691406,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.3287951858648799,
      "grad_norm": 0.024194782599806786,
      "kl": 0.014604568481445312,
      "learning_rate": 8.856374635655696e-08,
      "loss": -0.0229,
      "num_tokens": 343096860.0,
      "reward": 0.4131552577018738,
      "reward_std": 0.05632089450955391,
      "rewards/gemini_judge_reward_func/mean": 0.1205357164144516,
      "rewards/gemini_judge_reward_func/std": 0.23313553631305695,
      "rewards/semantic_correctness_reward_func/mean": 0.4044993221759796,
      "rewards/semantic_correctness_reward_func/std": 0.21371452510356903,
      "rewards/xmlcount_reward_func/mean": 0.7101027369499207,
      "rewards/xmlcount_reward_func/std": 0.45508646965026855,
      "step": 963
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 640.0,
      "completions/mean_length": 157.8169708251953,
      "completions/mean_terminated_length": 138.0410919189453,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.3291366138875848,
      "grad_norm": 0.020719485357403755,
      "kl": 0.017660140991210938,
      "learning_rate": 8.549231386298151e-08,
      "loss": 0.0174,
      "num_tokens": 343443715.0,
      "reward": 0.42168301343917847,
      "reward_std": 0.05975125730037689,
      "rewards/gemini_judge_reward_func/mean": 0.1439732164144516,
      "rewards/gemini_judge_reward_func/std": 0.2661329209804535,
      "rewards/semantic_correctness_reward_func/mean": 0.46196863055229187,
      "rewards/semantic_correctness_reward_func/std": 0.20193101465702057,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 964
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 403.0,
      "completions/mean_length": 159.6741180419922,
      "completions/mean_terminated_length": 151.88739013671875,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.3294780419102898,
      "grad_norm": 0.020591214299201965,
      "kl": 0.015319347381591797,
      "learning_rate": 8.247462563808816e-08,
      "loss": -0.0066,
      "num_tokens": 343756414.0,
      "reward": 0.44394856691360474,
      "reward_std": 0.059848301112651825,
      "rewards/gemini_judge_reward_func/mean": 0.1082589253783226,
      "rewards/gemini_judge_reward_func/std": 0.19306614995002747,
      "rewards/semantic_correctness_reward_func/mean": 0.43022483587265015,
      "rewards/semantic_correctness_reward_func/std": 0.2131272703409195,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 965
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 927.0,
      "completions/mean_length": 166.80804443359375,
      "completions/mean_terminated_length": 151.22271728515625,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.32981946993299477,
      "grad_norm": 0.02128966711461544,
      "kl": 0.01815199851989746,
      "learning_rate": 7.951071468283166e-08,
      "loss": -0.0017,
      "num_tokens": 344126527.0,
      "reward": 0.40433269739151,
      "reward_std": 0.07133690267801285,
      "rewards/gemini_judge_reward_func/mean": 0.1127232164144516,
      "rewards/gemini_judge_reward_func/std": 0.22301679849624634,
      "rewards/semantic_correctness_reward_func/mean": 0.4734667241573334,
      "rewards/semantic_correctness_reward_func/std": 0.20783737301826477,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 966
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 538.0,
      "completions/mean_length": 170.6116180419922,
      "completions/mean_terminated_length": 151.1278533935547,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.3301608979556997,
      "grad_norm": 0.01904473453760147,
      "kl": 0.013091087341308594,
      "learning_rate": 7.660061341006719e-08,
      "loss": -0.0302,
      "num_tokens": 344479128.0,
      "reward": 0.43435797095298767,
      "reward_std": 0.05595193803310394,
      "rewards/gemini_judge_reward_func/mean": 0.125,
      "rewards/gemini_judge_reward_func/std": 0.25278717279434204,
      "rewards/semantic_correctness_reward_func/mean": 0.44712895154953003,
      "rewards/semantic_correctness_reward_func/std": 0.22964619100093842,
      "rewards/xmlcount_reward_func/mean": 0.737330436706543,
      "rewards/xmlcount_reward_func/std": 0.439359575510025,
      "step": 967
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 900.0,
      "completions/mean_length": 159.1919708251953,
      "completions/mean_terminated_length": 151.40090942382812,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.3305023259784047,
      "grad_norm": 0.01983231119811535,
      "kl": 0.011693239212036133,
      "learning_rate": 7.374435364419675e-08,
      "loss": -0.0318,
      "num_tokens": 344824143.0,
      "reward": 0.4040999114513397,
      "reward_std": 0.048461802303791046,
      "rewards/gemini_judge_reward_func/mean": 0.1037946417927742,
      "rewards/gemini_judge_reward_func/std": 0.20393303036689758,
      "rewards/semantic_correctness_reward_func/mean": 0.41866016387939453,
      "rewards/semantic_correctness_reward_func/std": 0.20052829384803772,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 968
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 329.0,
      "completions/mean_length": 152.16519165039062,
      "completions/mean_terminated_length": 140.330322265625,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.33084375400110966,
      "grad_norm": 0.020666640251874924,
      "kl": 0.014385223388671875,
      "learning_rate": 7.094196662081832e-08,
      "loss": -0.0165,
      "num_tokens": 345193180.0,
      "reward": 0.4273673892021179,
      "reward_std": 0.06679557263851166,
      "rewards/gemini_judge_reward_func/mean": 0.1417410671710968,
      "rewards/gemini_judge_reward_func/std": 0.24774518609046936,
      "rewards/semantic_correctness_reward_func/mean": 0.4233546555042267,
      "rewards/semantic_correctness_reward_func/std": 0.18689891695976257,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 969
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 633.0,
      "completions/mean_length": 151.37054443359375,
      "completions/mean_terminated_length": 147.45741271972656,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.3311851820238146,
      "grad_norm": 0.020662736147642136,
      "kl": 0.012186050415039062,
      "learning_rate": 6.819348298638839e-08,
      "loss": 0.0109,
      "num_tokens": 345555479.0,
      "reward": 0.422772616147995,
      "reward_std": 0.0710143893957138,
      "rewards/gemini_judge_reward_func/mean": 0.1194196417927742,
      "rewards/gemini_judge_reward_func/std": 0.2331113964319229,
      "rewards/semantic_correctness_reward_func/mean": 0.4271486699581146,
      "rewards/semantic_correctness_reward_func/std": 0.2294951230287552,
      "rewards/xmlcount_reward_func/mean": 0.7239375710487366,
      "rewards/xmlcount_reward_func/std": 0.4488601088523865,
      "step": 970
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 409.0,
      "completions/mean_length": 161.2991180419922,
      "completions/mean_terminated_length": 149.58824157714844,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.33152661004651957,
      "grad_norm": 0.01932770572602749,
      "kl": 0.012654304504394531,
      "learning_rate": 6.549893279788278e-08,
      "loss": -0.0002,
      "num_tokens": 345886710.0,
      "reward": 0.4679696559906006,
      "reward_std": 0.0628255307674408,
      "rewards/gemini_judge_reward_func/mean": 0.1339285671710968,
      "rewards/gemini_judge_reward_func/std": 0.26455092430114746,
      "rewards/semantic_correctness_reward_func/mean": 0.44534817337989807,
      "rewards/semantic_correctness_reward_func/std": 0.19571352005004883,
      "rewards/xmlcount_reward_func/mean": 0.8133214712142944,
      "rewards/xmlcount_reward_func/std": 0.3857904076576233,
      "step": 971
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 432.0,
      "completions/mean_length": 172.18751525878906,
      "completions/mean_terminated_length": 160.6244354248047,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.33186803806922455,
      "grad_norm": 0.02037588320672512,
      "kl": 0.011893272399902344,
      "learning_rate": 6.285834552247127e-08,
      "loss": -0.0302,
      "num_tokens": 346223572.0,
      "reward": 0.4898928105831146,
      "reward_std": 0.0647522360086441,
      "rewards/gemini_judge_reward_func/mean": 0.1305803507566452,
      "rewards/gemini_judge_reward_func/std": 0.22947613894939423,
      "rewards/semantic_correctness_reward_func/mean": 0.4365532100200653,
      "rewards/semantic_correctness_reward_func/std": 0.18644456565380096,
      "rewards/xmlcount_reward_func/mean": 0.8758750557899475,
      "rewards/xmlcount_reward_func/std": 0.33179107308387756,
      "step": 972
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 878.0,
      "completions/mean_length": 164.3794708251953,
      "completions/mean_terminated_length": 140.72018432617188,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.3322094660919295,
      "grad_norm": 0.027246547862887383,
      "kl": 0.01928424835205078,
      "learning_rate": 6.027175003719354e-08,
      "loss": -0.0297,
      "num_tokens": 346613681.0,
      "reward": 0.34606069326400757,
      "reward_std": 0.04251888021826744,
      "rewards/gemini_judge_reward_func/mean": 0.0491071417927742,
      "rewards/gemini_judge_reward_func/std": 0.13325557112693787,
      "rewards/semantic_correctness_reward_func/mean": 0.3785982131958008,
      "rewards/semantic_correctness_reward_func/std": 0.20065419375896454,
      "rewards/xmlcount_reward_func/mean": 0.6267456412315369,
      "rewards/xmlcount_reward_func/std": 0.4845307171344757,
      "step": 973
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 398.0,
      "completions/mean_length": 156.5178680419922,
      "completions/mean_terminated_length": 136.7123260498047,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.33255089411463445,
      "grad_norm": 0.02008494734764099,
      "kl": 0.01699686050415039,
      "learning_rate": 5.773917462864265e-08,
      "loss": -0.0347,
      "num_tokens": 347005969.0,
      "reward": 0.38510963320732117,
      "reward_std": 0.051969029009342194,
      "rewards/gemini_judge_reward_func/mean": 0.0892857164144516,
      "rewards/gemini_judge_reward_func/std": 0.21270503103733063,
      "rewards/semantic_correctness_reward_func/mean": 0.38847652077674866,
      "rewards/semantic_correctness_reward_func/std": 0.1963837593793869,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 175.02232360839844,
      "completions/mean_terminated_length": 143.57870483398438,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.33289232213733944,
      "grad_norm": 0.019948428496718407,
      "kl": 0.01439356803894043,
      "learning_rate": 5.526064699265754e-08,
      "loss": -0.0112,
      "num_tokens": 347383882.0,
      "reward": 0.3975110352039337,
      "reward_std": 0.06101817265152931,
      "rewards/gemini_judge_reward_func/mean": 0.1238839253783226,
      "rewards/gemini_judge_reward_func/std": 0.269942045211792,
      "rewards/semantic_correctness_reward_func/mean": 0.41703715920448303,
      "rewards/semantic_correctness_reward_func/std": 0.25031930208206177,
      "rewards/xmlcount_reward_func/mean": 0.6613750457763672,
      "rewards/xmlcount_reward_func/std": 0.47500187158584595,
      "step": 975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 826.0,
      "completions/mean_length": 175.96876525878906,
      "completions/mean_terminated_length": 156.6072998046875,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.33323375016004436,
      "grad_norm": 0.020137697458267212,
      "kl": 0.014559745788574219,
      "learning_rate": 5.2836194234019976e-08,
      "loss": 0.0183,
      "num_tokens": 347752771.0,
      "reward": 0.4365932047367096,
      "reward_std": 0.07334822416305542,
      "rewards/gemini_judge_reward_func/mean": 0.1395089328289032,
      "rewards/gemini_judge_reward_func/std": 0.2611019015312195,
      "rewards/semantic_correctness_reward_func/mean": 0.4761890470981598,
      "rewards/semantic_correctness_reward_func/std": 0.20910099148750305,
      "rewards/xmlcount_reward_func/mean": 0.7138795256614685,
      "rewards/xmlcount_reward_func/std": 0.452818363904953,
      "step": 976
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 340.0,
      "completions/mean_length": 156.1294708251953,
      "completions/mean_terminated_length": 144.34841918945312,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.33357517818274934,
      "grad_norm": 0.018826456740498543,
      "kl": 0.01526784896850586,
      "learning_rate": 5.0465842866156965e-08,
      "loss": -0.0179,
      "num_tokens": 348123360.0,
      "reward": 0.41452330350875854,
      "reward_std": 0.0659627914428711,
      "rewards/gemini_judge_reward_func/mean": 0.1361607164144516,
      "rewards/gemini_judge_reward_func/std": 0.24692820012569427,
      "rewards/semantic_correctness_reward_func/mean": 0.4417950510978699,
      "rewards/semantic_correctness_reward_func/std": 0.21186378598213196,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 977
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 391.0,
      "completions/max_terminated_length": 391.0,
      "completions/mean_length": 141.6116180419922,
      "completions/mean_terminated_length": 141.6116180419922,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.3339166062054543,
      "grad_norm": 0.021350812166929245,
      "kl": 0.014334440231323242,
      "learning_rate": 4.8149618810850454e-08,
      "loss": 0.0309,
      "num_tokens": 348474477.0,
      "reward": 0.44549697637557983,
      "reward_std": 0.07247848808765411,
      "rewards/gemini_judge_reward_func/mean": 0.1015625,
      "rewards/gemini_judge_reward_func/std": 0.23199227452278137,
      "rewards/semantic_correctness_reward_func/mean": 0.41560983657836914,
      "rewards/semantic_correctness_reward_func/std": 0.2107170671224594,
      "rewards/xmlcount_reward_func/mean": 0.8043750524520874,
      "rewards/xmlcount_reward_func/std": 0.3985843360424042,
      "step": 978
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 177.31251525878906,
      "completions/mean_terminated_length": 150.0,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.33425803422815925,
      "grad_norm": 0.020839158445596695,
      "kl": 0.013498067855834961,
      "learning_rate": 4.588754739795587e-08,
      "loss": -0.0097,
      "num_tokens": 348839815.0,
      "reward": 0.43120020627975464,
      "reward_std": 0.057902269065380096,
      "rewards/gemini_judge_reward_func/mean": 0.0993303582072258,
      "rewards/gemini_judge_reward_func/std": 0.197829008102417,
      "rewards/semantic_correctness_reward_func/mean": 0.4558401107788086,
      "rewards/semantic_correctness_reward_func/std": 0.19552071392536163,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 979
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 159.4419708251953,
      "completions/mean_terminated_length": 147.7058868408203,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.33459946225086423,
      "grad_norm": 0.0211932510137558,
      "kl": 0.014786720275878906,
      "learning_rate": 4.367965336512403e-08,
      "loss": -0.0091,
      "num_tokens": 349205298.0,
      "reward": 0.39420926570892334,
      "reward_std": 0.061161503195762634,
      "rewards/gemini_judge_reward_func/mean": 0.1171875,
      "rewards/gemini_judge_reward_func/std": 0.24705736339092255,
      "rewards/semantic_correctness_reward_func/mean": 0.37817126512527466,
      "rewards/semantic_correctness_reward_func/std": 0.22185632586479187,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 980
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 846.0,
      "completions/mean_length": 175.5491180419922,
      "completions/mean_terminated_length": 152.19723510742188,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.3349408902735692,
      "grad_norm": 0.020670127123594284,
      "kl": 0.01729416847229004,
      "learning_rate": 4.1525960857530244e-08,
      "loss": 0.0071,
      "num_tokens": 349555813.0,
      "reward": 0.44997820258140564,
      "reward_std": 0.06321458518505096,
      "rewards/gemini_judge_reward_func/mean": 0.1584821492433548,
      "rewards/gemini_judge_reward_func/std": 0.260422945022583,
      "rewards/semantic_correctness_reward_func/mean": 0.44487300515174866,
      "rewards/semantic_correctness_reward_func/std": 0.21552570164203644,
      "rewards/xmlcount_reward_func/mean": 0.7440268397331238,
      "rewards/xmlcount_reward_func/std": 0.4337250292301178,
      "step": 981
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 302.0,
      "completions/mean_length": 173.38839721679688,
      "completions/mean_terminated_length": 149.97705078125,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.33528231829627414,
      "grad_norm": 0.0203064177185297,
      "kl": 0.0137939453125,
      "learning_rate": 3.9426493427611177e-08,
      "loss": 0.0006,
      "num_tokens": 349903064.0,
      "reward": 0.4368150234222412,
      "reward_std": 0.061177946627140045,
      "rewards/gemini_judge_reward_func/mean": 0.1216517835855484,
      "rewards/gemini_judge_reward_func/std": 0.23194913566112518,
      "rewards/semantic_correctness_reward_func/mean": 0.4392712414264679,
      "rewards/semantic_correctness_reward_func/std": 0.22072644531726837,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 982
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 338.0,
      "completions/mean_length": 166.4107208251953,
      "completions/mean_terminated_length": 146.83103942871094,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.3356237463189791,
      "grad_norm": 0.01848675310611725,
      "kl": 0.012476444244384766,
      "learning_rate": 3.738127403480507e-08,
      "loss": -0.0361,
      "num_tokens": 350265044.0,
      "reward": 0.4574826657772064,
      "reward_std": 0.08490362018346786,
      "rewards/gemini_judge_reward_func/mean": 0.1629464328289032,
      "rewards/gemini_judge_reward_func/std": 0.2894052565097809,
      "rewards/semantic_correctness_reward_func/mean": 0.4600202143192291,
      "rewards/semantic_correctness_reward_func/std": 0.21947798132896423,
      "rewards/xmlcount_reward_func/mean": 0.7507500648498535,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 983
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.013392857142857095,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 278.0,
      "completions/mean_length": 160.2678680419922,
      "completions/mean_terminated_length": 148.54299926757812,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.3359651743416841,
      "grad_norm": 0.018878811970353127,
      "kl": 0.012603759765625,
      "learning_rate": 3.5390325045304704e-08,
      "loss": -0.005,
      "num_tokens": 350595932.0,
      "reward": 0.44260725378990173,
      "reward_std": 0.06230099871754646,
      "rewards/gemini_judge_reward_func/mean": 0.1127232164144516,
      "rewards/gemini_judge_reward_func/std": 0.24113184213638306,
      "rewards/semantic_correctness_reward_func/mean": 0.41458967328071594,
      "rewards/semantic_correctness_reward_func/std": 0.1906474530696869,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 984
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 453.0,
      "completions/mean_length": 147.80804443359375,
      "completions/mean_terminated_length": 143.87893676757812,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.3363066023643891,
      "grad_norm": 0.020969383418560028,
      "kl": 0.016061782836914062,
      "learning_rate": 3.345366823180929e-08,
      "loss": 0.0002,
      "num_tokens": 350964901.0,
      "reward": 0.4793672561645508,
      "reward_std": 0.06915397942066193,
      "rewards/gemini_judge_reward_func/mean": 0.1529017835855484,
      "rewards/gemini_judge_reward_func/std": 0.276716947555542,
      "rewards/semantic_correctness_reward_func/mean": 0.4465325176715851,
      "rewards/semantic_correctness_reward_func/std": 0.23200318217277527,
      "rewards/xmlcount_reward_func/mean": 0.8222500681877136,
      "rewards/xmlcount_reward_func/std": 0.3842346966266632,
      "step": 985
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 820.0,
      "completions/mean_length": 153.09375,
      "completions/mean_terminated_length": 145.2477569580078,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.336648030387094,
      "grad_norm": 0.019626779481768608,
      "kl": 0.012929677963256836,
      "learning_rate": 3.1571324773286284e-08,
      "loss": 0.0086,
      "num_tokens": 351329478.0,
      "reward": 0.47190842032432556,
      "reward_std": 0.09431184083223343,
      "rewards/gemini_judge_reward_func/mean": 0.1908482164144516,
      "rewards/gemini_judge_reward_func/std": 0.31251001358032227,
      "rewards/semantic_correctness_reward_func/mean": 0.507622241973877,
      "rewards/semantic_correctness_reward_func/std": 0.20726893842220306,
      "rewards/xmlcount_reward_func/mean": 0.735111653804779,
      "rewards/xmlcount_reward_func/std": 0.441826730966568,
      "step": 986
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 164.71429443359375,
      "completions/mean_terminated_length": 145.09588623046875,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.336989458409799,
      "grad_norm": 0.020227260887622833,
      "kl": 0.017192602157592773,
      "learning_rate": 2.9743315254743834e-08,
      "loss": 0.0006,
      "num_tokens": 351678310.0,
      "reward": 0.44850510358810425,
      "reward_std": 0.07030683010816574,
      "rewards/gemini_judge_reward_func/mean": 0.1573660671710968,
      "rewards/gemini_judge_reward_func/std": 0.2792554795742035,
      "rewards/semantic_correctness_reward_func/mean": 0.46204322576522827,
      "rewards/semantic_correctness_reward_func/std": 0.20317070186138153,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 987
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 705.0,
      "completions/mean_length": 164.22769165039062,
      "completions/mean_terminated_length": 140.564208984375,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.337330886432504,
      "grad_norm": 0.019894586876034737,
      "kl": 0.016435623168945312,
      "learning_rate": 2.7969659666999273e-08,
      "loss": -0.0137,
      "num_tokens": 352053657.0,
      "reward": 0.3776443302631378,
      "reward_std": 0.06739164888858795,
      "rewards/gemini_judge_reward_func/mean": 0.1417410671710968,
      "rewards/gemini_judge_reward_func/std": 0.2796315848827362,
      "rewards/semantic_correctness_reward_func/mean": 0.4249892830848694,
      "rewards/semantic_correctness_reward_func/std": 0.2107486128807068,
      "rewards/xmlcount_reward_func/mean": 0.5898750424385071,
      "rewards/xmlcount_reward_func/std": 0.493558406829834,
      "step": 988
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 328.0,
      "completions/mean_length": 155.8482208251953,
      "completions/mean_terminated_length": 140.06362915039062,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.3376723144552089,
      "grad_norm": 0.02049451507627964,
      "kl": 0.016164541244506836,
      "learning_rate": 2.625037740646763e-08,
      "loss": -0.0158,
      "num_tokens": 352410271.0,
      "reward": 0.4529190957546234,
      "reward_std": 0.07489325851202011,
      "rewards/gemini_judge_reward_func/mean": 0.1607142835855484,
      "rewards/gemini_judge_reward_func/std": 0.28872138261795044,
      "rewards/semantic_correctness_reward_func/mean": 0.44166675209999084,
      "rewards/semantic_correctness_reward_func/std": 0.2282264530658722,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 989
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 363.0,
      "completions/mean_length": 167.32589721679688,
      "completions/mean_terminated_length": 147.76712036132812,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.3380137424779139,
      "grad_norm": 0.02016194351017475,
      "kl": 0.012232065200805664,
      "learning_rate": 2.4585487274942922e-08,
      "loss": -0.0117,
      "num_tokens": 352787360.0,
      "reward": 0.4089185893535614,
      "reward_std": 0.056611210107803345,
      "rewards/gemini_judge_reward_func/mean": 0.1037946417927742,
      "rewards/gemini_judge_reward_func/std": 0.23580104112625122,
      "rewards/semantic_correctness_reward_func/mean": 0.4427536427974701,
      "rewards/semantic_correctness_reward_func/std": 0.2222413569688797,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903594970703,
      "step": 990
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 791.0,
      "completions/mean_length": 157.52679443359375,
      "completions/mean_terminated_length": 149.72071838378906,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.33835517050061886,
      "grad_norm": 0.019806455820798874,
      "kl": 0.014673709869384766,
      "learning_rate": 2.2975007479397736e-08,
      "loss": -0.0125,
      "num_tokens": 353139082.0,
      "reward": 0.415515273809433,
      "reward_std": 0.05204417183995247,
      "rewards/gemini_judge_reward_func/mean": 0.1071428582072258,
      "rewards/gemini_judge_reward_func/std": 0.1915077120065689,
      "rewards/semantic_correctness_reward_func/mean": 0.39754053950309753,
      "rewards/semantic_correctness_reward_func/std": 0.1915198266506195,
      "rewards/xmlcount_reward_func/mean": 0.7328749895095825,
      "rewards/xmlcount_reward_func/std": 0.44427838921546936,
      "step": 991
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 579.0,
      "completions/mean_length": 160.17857360839844,
      "completions/mean_terminated_length": 136.40367126464844,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.3386965985233238,
      "grad_norm": 0.021138962358236313,
      "kl": 0.014227151870727539,
      "learning_rate": 2.1418955631781203e-08,
      "loss": -0.0001,
      "num_tokens": 353492886.0,
      "reward": 0.406089186668396,
      "reward_std": 0.05585183575749397,
      "rewards/gemini_judge_reward_func/mean": 0.15625,
      "rewards/gemini_judge_reward_func/std": 0.28237661719322205,
      "rewards/semantic_correctness_reward_func/mean": 0.4666958451271057,
      "rewards/semantic_correctness_reward_func/std": 0.20080603659152985,
      "rewards/xmlcount_reward_func/mean": 0.6256250739097595,
      "rewards/xmlcount_reward_func/std": 0.48569241166114807,
      "step": 992
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 734.0,
      "completions/mean_length": 171.7678680419922,
      "completions/mean_terminated_length": 148.31192016601562,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.33903802654602877,
      "grad_norm": 0.019860416650772095,
      "kl": 0.014904022216796875,
      "learning_rate": 1.9917348748826337e-08,
      "loss": 0.0035,
      "num_tokens": 353868734.0,
      "reward": 0.41911885142326355,
      "reward_std": 0.06615443527698517,
      "rewards/gemini_judge_reward_func/mean": 0.1294642835855484,
      "rewards/gemini_judge_reward_func/std": 0.2603941261768341,
      "rewards/semantic_correctness_reward_func/mean": 0.4424155652523041,
      "rewards/semantic_correctness_reward_func/std": 0.21090011298656464,
      "rewards/xmlcount_reward_func/mean": 0.6971250176429749,
      "rewards/xmlcount_reward_func/std": 0.4612903892993927,
      "step": 993
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 264.0,
      "completions/mean_length": 158.16964721679688,
      "completions/mean_terminated_length": 142.42726135253906,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.33937945456873375,
      "grad_norm": 0.02125832624733448,
      "kl": 0.01460719108581543,
      "learning_rate": 1.847020325186577e-08,
      "loss": -0.018,
      "num_tokens": 354216172.0,
      "reward": 0.47000354528427124,
      "reward_std": 0.07803654670715332,
      "rewards/gemini_judge_reward_func/mean": 0.1607142835855484,
      "rewards/gemini_judge_reward_func/std": 0.2633373737335205,
      "rewards/semantic_correctness_reward_func/mean": 0.45558905601501465,
      "rewards/semantic_correctness_reward_func/std": 0.20520326495170593,
      "rewards/xmlcount_reward_func/mean": 0.7865000367164612,
      "rewards/xmlcount_reward_func/std": 0.41165614128112793,
      "step": 994
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 655.0,
      "completions/mean_length": 162.82589721679688,
      "completions/mean_terminated_length": 147.16818237304688,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.3397208825914387,
      "grad_norm": 0.02087085321545601,
      "kl": 0.013870716094970703,
      "learning_rate": 1.7077534966650767e-08,
      "loss": 0.0017,
      "num_tokens": 354569773.0,
      "reward": 0.4318699240684509,
      "reward_std": 0.055673111230134964,
      "rewards/gemini_judge_reward_func/mean": 0.1183035746216774,
      "rewards/gemini_judge_reward_func/std": 0.24595339596271515,
      "rewards/semantic_correctness_reward_func/mean": 0.4418494701385498,
      "rewards/semantic_correctness_reward_func/std": 0.21151991188526154,
      "rewards/xmlcount_reward_func/mean": 0.7404464483261108,
      "rewards/xmlcount_reward_func/std": 0.43912947177886963,
      "step": 995
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 282.0,
      "completions/mean_length": 149.60714721679688,
      "completions/mean_terminated_length": 145.6861114501953,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.34006231061414366,
      "grad_norm": 0.022532809525728226,
      "kl": 0.01392984390258789,
      "learning_rate": 1.5739359123178587e-08,
      "loss": -0.003,
      "num_tokens": 354903629.0,
      "reward": 0.47908294200897217,
      "reward_std": 0.06745254993438721,
      "rewards/gemini_judge_reward_func/mean": 0.1674107164144516,
      "rewards/gemini_judge_reward_func/std": 0.30849042534828186,
      "rewards/semantic_correctness_reward_func/mean": 0.4438968300819397,
      "rewards/semantic_correctness_reward_func/std": 0.23170118033885956,
      "rewards/xmlcount_reward_func/mean": 0.8083482384681702,
      "rewards/xmlcount_reward_func/std": 0.39272549748420715,
      "step": 996
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 305.0,
      "completions/mean_length": 166.58929443359375,
      "completions/mean_terminated_length": 142.99081420898438,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.34040373863684864,
      "grad_norm": 0.02883129194378853,
      "kl": 0.013774394989013672,
      "learning_rate": 1.4455690355525964e-08,
      "loss": -0.0131,
      "num_tokens": 355264021.0,
      "reward": 0.40281856060028076,
      "reward_std": 0.05309184268116951,
      "rewards/gemini_judge_reward_func/mean": 0.1183035746216774,
      "rewards/gemini_judge_reward_func/std": 0.23187628388404846,
      "rewards/semantic_correctness_reward_func/mean": 0.41898536682128906,
      "rewards/semantic_correctness_reward_func/std": 0.20515595376491547,
      "rewards/xmlcount_reward_func/mean": 0.6792500615119934,
      "rewards/xmlcount_reward_func/std": 0.46853893995285034,
      "step": 997
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.008928571428571397,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 383.0,
      "completions/mean_length": 152.8928680419922,
      "completions/mean_terminated_length": 145.0450439453125,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.34074516665955357,
      "grad_norm": 0.020875928923487663,
      "kl": 0.014181137084960938,
      "learning_rate": 1.3226542701689215e-08,
      "loss": -0.0043,
      "num_tokens": 355602493.0,
      "reward": 0.45096153020858765,
      "reward_std": 0.0548894889652729,
      "rewards/gemini_judge_reward_func/mean": 0.1506696492433548,
      "rewards/gemini_judge_reward_func/std": 0.27895063161849976,
      "rewards/semantic_correctness_reward_func/mean": 0.451968252658844,
      "rewards/semantic_correctness_reward_func/std": 0.22784963250160217,
      "rewards/xmlcount_reward_func/mean": 0.7507500052452087,
      "rewards/xmlcount_reward_func/std": 0.4344164729118347,
      "step": 998
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.004464285714285698,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 330.0,
      "completions/mean_length": 145.7991180419922,
      "completions/mean_terminated_length": 141.86099243164062,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.34108659468225855,
      "grad_norm": 0.02083824761211872,
      "kl": 0.015220165252685547,
      "learning_rate": 1.2051929603428824e-08,
      "loss": 0.0042,
      "num_tokens": 355963108.0,
      "reward": 0.4503902792930603,
      "reward_std": 0.06350675225257874,
      "rewards/gemini_judge_reward_func/mean": 0.1584821492433548,
      "rewards/gemini_judge_reward_func/std": 0.27303215861320496,
      "rewards/semantic_correctness_reward_func/mean": 0.4737100899219513,
      "rewards/semantic_correctness_reward_func/std": 0.23610134422779083,
      "rewards/xmlcount_reward_func/mean": 0.7306384444236755,
      "rewards/xmlcount_reward_func/std": 0.4441836178302765,
      "step": 999
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 425.0,
      "completions/mean_length": 161.3794708251953,
      "completions/mean_terminated_length": 137.63760375976562,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.34142802270496353,
      "grad_norm": 0.021282846108078957,
      "kl": 0.015776872634887695,
      "learning_rate": 1.0931863906127327e-08,
      "loss": 0.0138,
      "num_tokens": 356339045.0,
      "reward": 0.41461434960365295,
      "reward_std": 0.05614163354039192,
      "rewards/gemini_judge_reward_func/mean": 0.1138392835855484,
      "rewards/gemini_judge_reward_func/std": 0.24806062877178192,
      "rewards/semantic_correctness_reward_func/mean": 0.4153929352760315,
      "rewards/semantic_correctness_reward_func/std": 0.21967321634292603,
      "rewards/xmlcount_reward_func/mean": 0.7150000333786011,
      "rewards/xmlcount_reward_func/std": 0.4532184898853302,
      "step": 1000
    }
  ],
  "logging_steps": 1,
  "max_steps": 1000,
  "num_input_tokens_seen": 356339045,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}