{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 100, "global_step": 4255, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1701.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 818.65625, "completions/mean_terminated_length": 818.65625, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "entropy": 0.47302141785621643, "epoch": 0.0011750881316098707, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 105866.0, "reward": 0.0, "reward_std": 0.0, "rewards/e2e_recall_precision_mixed_reward/mean": 0.0, "rewards/e2e_recall_precision_mixed_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998794794082642, "sampling/importance_sampling_ratio/min": 0.04267038777470589, "sampling/sampling_logp_difference/max": 3.154250144958496, "sampling/sampling_logp_difference/mean": 0.020557792857289314, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2188.0, "completions/max_terminated_length": 2159.5, "completions/mean_length": 957.0234375, "completions/mean_terminated_length": 950.7090911865234, "completions/min_length": 425.5, "completions/min_terminated_length": 425.5, "entropy": 0.5007665604352951, "epoch": 0.005875440658049354, "frac_reward_zero_std": 0.375, "grad_norm": 0.9459405541419983, "learning_rate": 1.5625e-08, "loss": 0.0304, "num_tokens": 591756.0, "reward": 0.140625, "reward_std": 0.1800631694495678, "rewards/e2e_recall_precision_mixed_reward/mean": 0.140625, "rewards/e2e_recall_precision_mixed_reward/std": 0.25662297010421753, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000313818454742, "sampling/importance_sampling_ratio/min": 0.2443008739501238, "sampling/sampling_logp_difference/max": 1.5048222541809082, "sampling/sampling_logp_difference/mean": 0.02111760200932622, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375, "completions/max_length": 2233.0, "completions/max_terminated_length": 2072.4, "completions/mean_length": 950.425, "completions/mean_terminated_length": 937.0094848632813, "completions/min_length": 493.6, "completions/min_terminated_length": 493.6, "entropy": 0.5007948040962219, "epoch": 0.011750881316098707, "frac_reward_zero_std": 0.3, "grad_norm": 1.1323678493499756, "learning_rate": 3.515625e-08, "loss": -0.0117, "num_tokens": 1205448.0, "reward": 0.1359375, "reward_std": 0.1866186186671257, "rewards/e2e_recall_precision_mixed_reward/mean": 0.1359375, "rewards/e2e_recall_precision_mixed_reward/std": 0.2397767573595047, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000044345855713, "sampling/importance_sampling_ratio/min": 0.23340302407741548, "sampling/sampling_logp_difference/max": 1.5875318050384521, "sampling/sampling_logp_difference/mean": 0.021055334806442262, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021875, "completions/max_length": 2114.2, "completions/max_terminated_length": 1918.0, "completions/mean_length": 950.975, "completions/mean_terminated_length": 918.5927612304688, "completions/min_length": 437.6, "completions/min_terminated_length": 437.6, "entropy": 0.46904313564300537, "epoch": 0.01762632197414806, "frac_reward_zero_std": 0.3, "grad_norm": 1.0092235803604126, "learning_rate": 5.46875e-08, "loss": 0.0036, "num_tokens": 1803156.0, "reward": 0.17921874821186065, "reward_std": 0.1914873868227005, "rewards/e2e_recall_precision_mixed_reward/mean": 0.17921874821186065, "rewards/e2e_recall_precision_mixed_reward/std": 0.33369612991809844, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998673915863037, "sampling/importance_sampling_ratio/min": 0.15503151454031466, "sampling/sampling_logp_difference/max": 2.8057526469230654, "sampling/sampling_logp_difference/mean": 0.020007848739624023, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2388.8, "completions/max_terminated_length": 2238.0, "completions/mean_length": 1094.490625, "completions/mean_terminated_length": 1051.4914428710938, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "entropy": 0.49076443910598755, "epoch": 0.023501762632197415, "frac_reward_zero_std": 0.4, "grad_norm": 0.7749466896057129, "learning_rate": 7.421874999999999e-08, "loss": 0.0094, "num_tokens": 2455897.0, "reward": 0.10833333432674408, "reward_std": 0.15549785941839217, "rewards/e2e_recall_precision_mixed_reward/mean": 0.10833333432674408, "rewards/e2e_recall_precision_mixed_reward/std": 0.2179785817861557, "sampling/importance_sampling_ratio/max": 1.978114414215088, "sampling/importance_sampling_ratio/mean": 0.9999550819396973, "sampling/importance_sampling_ratio/min": 0.17499387562274932, "sampling/sampling_logp_difference/max": 1.8573448657989502, "sampling/sampling_logp_difference/mean": 0.02045784331858158, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 2429.6, "completions/max_terminated_length": 2392.4, "completions/mean_length": 968.49375, "completions/mean_terminated_length": 959.4304931640625, "completions/min_length": 429.2, "completions/min_terminated_length": 429.2, "entropy": 0.47716065049171447, "epoch": 0.02937720329024677, "frac_reward_zero_std": 0.55, "grad_norm": 0.4625045955181122, "learning_rate": 9.375e-08, "loss": 0.0078, "num_tokens": 3099823.0, "reward": 0.08489583395421504, "reward_std": 0.09943832308053971, "rewards/e2e_recall_precision_mixed_reward/mean": 0.08489583395421504, "rewards/e2e_recall_precision_mixed_reward/std": 0.17509274780750275, "sampling/importance_sampling_ratio/max": 1.9894845247268678, "sampling/importance_sampling_ratio/mean": 0.9998233199119568, "sampling/importance_sampling_ratio/min": 0.14390659239143133, "sampling/sampling_logp_difference/max": 2.679186391830444, "sampling/sampling_logp_difference/mean": 0.020350834354758263, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.028125, "completions/max_length": 2164.0, "completions/max_terminated_length": 2108.0, "completions/mean_length": 952.025, "completions/mean_terminated_length": 907.9393676757812, "completions/min_length": 417.8, "completions/min_terminated_length": 417.8, "entropy": 0.4895743727684021, "epoch": 0.03525264394829612, "frac_reward_zero_std": 0.6, "grad_norm": 0.0, "learning_rate": 1.1328125e-07, "loss": 0.0265, "num_tokens": 3705635.0, "reward": 0.109375, "reward_std": 0.11092274188995362, "rewards/e2e_recall_precision_mixed_reward/mean": 0.109375, "rewards/e2e_recall_precision_mixed_reward/std": 0.20727644562721254, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999130725860595, "sampling/importance_sampling_ratio/min": 0.19804175468862012, "sampling/sampling_logp_difference/max": 3.9093191623687744, "sampling/sampling_logp_difference/mean": 0.020513736456632615, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2022.4, "completions/max_terminated_length": 1915.6, "completions/mean_length": 915.609375, "completions/mean_terminated_length": 892.3630615234375, "completions/min_length": 491.6, "completions/min_terminated_length": 491.6, "entropy": 0.4711003482341766, "epoch": 0.041128084606345476, "frac_reward_zero_std": 0.15, "grad_norm": 1.1989802122116089, "learning_rate": 1.328125e-07, "loss": 0.0196, "num_tokens": 4306066.0, "reward": 0.19791666865348817, "reward_std": 0.2227712243795395, "rewards/e2e_recall_precision_mixed_reward/mean": 0.19791666865348817, "rewards/e2e_recall_precision_mixed_reward/std": 0.292997220158577, "sampling/importance_sampling_ratio/max": 1.9896412134170531, "sampling/importance_sampling_ratio/mean": 0.9999082565307618, "sampling/importance_sampling_ratio/min": 0.15322894011624158, "sampling/sampling_logp_difference/max": 2.785897207260132, "sampling/sampling_logp_difference/mean": 0.02024412974715233, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.040625, "completions/max_length": 1985.0, "completions/max_terminated_length": 1960.2, "completions/mean_length": 982.38125, "completions/mean_terminated_length": 922.0343994140625, "completions/min_length": 458.2, "completions/min_terminated_length": 458.2, "entropy": 0.47347159385681153, "epoch": 0.04700352526439483, "frac_reward_zero_std": 0.55, "grad_norm": 0.9034355282783508, "learning_rate": 1.5234375e-07, "loss": -0.0147, "num_tokens": 4929336.0, "reward": 0.09375, "reward_std": 0.13284323811531068, "rewards/e2e_recall_precision_mixed_reward/mean": 0.09375, "rewards/e2e_recall_precision_mixed_reward/std": 0.24181169271469116, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000688910484314, "sampling/importance_sampling_ratio/min": 0.19303356036543845, "sampling/sampling_logp_difference/max": 1.9485284805297851, "sampling/sampling_logp_difference/mean": 0.02038279250264168, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021875, "completions/max_length": 2342.0, "completions/max_terminated_length": 2276.4, "completions/mean_length": 1027.528125, "completions/mean_terminated_length": 996.1073974609375, "completions/min_length": 482.2, "completions/min_terminated_length": 482.2, "entropy": 0.5043671131134033, "epoch": 0.052878965922444184, "frac_reward_zero_std": 0.4, "grad_norm": 0.9663382172584534, "learning_rate": 1.71875e-07, "loss": 0.0037, "num_tokens": 5559637.0, "reward": 0.21223958134651183, "reward_std": 0.16447981745004653, "rewards/e2e_recall_precision_mixed_reward/mean": 0.21223958134651183, "rewards/e2e_recall_precision_mixed_reward/std": 0.2990086942911148, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000964164733888, "sampling/importance_sampling_ratio/min": 0.11464353739283979, "sampling/sampling_logp_difference/max": 2.9539984464645386, "sampling/sampling_logp_difference/mean": 0.021145598217844962, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2461.4, "completions/max_terminated_length": 2225.4, "completions/mean_length": 1047.071875, "completions/mean_terminated_length": 1001.1214721679687, "completions/min_length": 473.6, "completions/min_terminated_length": 473.6, "entropy": 0.48020014762878416, "epoch": 0.05875440658049354, "frac_reward_zero_std": 0.25, "grad_norm": 0.749260663986206, "learning_rate": 1.9140625e-07, "loss": 0.0315, "num_tokens": 6190292.0, "reward": 0.18177083432674407, "reward_std": 0.20050898492336272, "rewards/e2e_recall_precision_mixed_reward/mean": 0.18177083432674407, "rewards/e2e_recall_precision_mixed_reward/std": 0.2954167366027832, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999913454055787, "sampling/importance_sampling_ratio/min": 0.1635954909026623, "sampling/sampling_logp_difference/max": 2.131788170337677, "sampling/sampling_logp_difference/mean": 0.02059806026518345, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04375, "completions/max_length": 2298.0, "completions/max_terminated_length": 2198.2, "completions/mean_length": 1082.275, "completions/mean_terminated_length": 1020.4909301757813, "completions/min_length": 407.6, "completions/min_terminated_length": 407.6, "entropy": 0.495795863866806, "epoch": 0.06462984723854288, "frac_reward_zero_std": 0.35, "grad_norm": 0.8722389936447144, "learning_rate": 2.109375e-07, "loss": -0.0005, "num_tokens": 6832436.0, "reward": 0.1869791716337204, "reward_std": 0.17626949846744538, "rewards/e2e_recall_precision_mixed_reward/mean": 0.1869791716337204, "rewards/e2e_recall_precision_mixed_reward/std": 0.2800693780183792, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998305559158325, "sampling/importance_sampling_ratio/min": 0.2545178957283497, "sampling/sampling_logp_difference/max": 1.766669464111328, "sampling/sampling_logp_difference/mean": 0.020665578171610834, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2152.2, "completions/max_terminated_length": 2152.2, "completions/mean_length": 995.3125, "completions/mean_terminated_length": 995.3125, "completions/min_length": 499.8, "completions/min_terminated_length": 499.8, "entropy": 0.4970084547996521, "epoch": 0.07050528789659224, "frac_reward_zero_std": 0.45, "grad_norm": 1.0150976181030273, "learning_rate": 2.3046875e-07, "loss": 0.032, "num_tokens": 7475432.0, "reward": 0.19635416567325592, "reward_std": 0.14557099491357803, "rewards/e2e_recall_precision_mixed_reward/mean": 0.19635416567325592, "rewards/e2e_recall_precision_mixed_reward/std": 0.32434697151184083, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000025498867035, "sampling/importance_sampling_ratio/min": 0.2074445564299822, "sampling/sampling_logp_difference/max": 1.8555801391601563, "sampling/sampling_logp_difference/mean": 0.020710907503962518, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.034375, "completions/max_length": 2296.0, "completions/max_terminated_length": 2017.4, "completions/mean_length": 1097.91875, "completions/mean_terminated_length": 1054.652783203125, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "entropy": 0.4965181291103363, "epoch": 0.07638072855464159, "frac_reward_zero_std": 0.45, "grad_norm": 0.9392759799957275, "learning_rate": 2.5e-07, "loss": 0.0015, "num_tokens": 8140146.0, "reward": 0.1579166680574417, "reward_std": 0.1368572235107422, "rewards/e2e_recall_precision_mixed_reward/mean": 0.1579166680574417, "rewards/e2e_recall_precision_mixed_reward/std": 0.25851217806339266, "sampling/importance_sampling_ratio/max": 1.9590824365615844, "sampling/importance_sampling_ratio/mean": 0.9998887419700623, "sampling/importance_sampling_ratio/min": 0.2710499167442322, "sampling/sampling_logp_difference/max": 1.3179824352264404, "sampling/sampling_logp_difference/mean": 0.020697080716490745, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2351.6, "completions/max_terminated_length": 2013.8, "completions/mean_length": 972.6, "completions/mean_terminated_length": 948.5700805664062, "completions/min_length": 435.6, "completions/min_terminated_length": 435.6, "entropy": 0.5015838086605072, "epoch": 0.08225616921269095, "frac_reward_zero_std": 0.25, "grad_norm": 0.5987149477005005, "learning_rate": 2.6953125e-07, "loss": -0.0088, "num_tokens": 8764046.0, "reward": 0.15885416865348817, "reward_std": 0.16067830175161363, "rewards/e2e_recall_precision_mixed_reward/mean": 0.15885416865348817, "rewards/e2e_recall_precision_mixed_reward/std": 0.22674326300621034, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000308990478515, "sampling/importance_sampling_ratio/min": 0.18691894211806356, "sampling/sampling_logp_difference/max": 2.474703884124756, "sampling/sampling_logp_difference/mean": 0.020612315833568574, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 2124.0, "completions/max_terminated_length": 2083.6, "completions/mean_length": 988.415625, "completions/mean_terminated_length": 972.781884765625, "completions/min_length": 492.4, "completions/min_terminated_length": 492.4, "entropy": 0.48552640676498415, "epoch": 0.0881316098707403, "frac_reward_zero_std": 0.1, "grad_norm": 1.33731210231781, "learning_rate": 2.890625e-07, "loss": 0.0099, "num_tokens": 9408467.0, "reward": 0.24713541865348815, "reward_std": 0.2190377503633499, "rewards/e2e_recall_precision_mixed_reward/mean": 0.24713541865348815, "rewards/e2e_recall_precision_mixed_reward/std": 0.306401264667511, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999894917011261, "sampling/importance_sampling_ratio/min": 0.18375444859266282, "sampling/sampling_logp_difference/max": 1.751639199256897, "sampling/sampling_logp_difference/mean": 0.020667668804526328, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.028125, "completions/max_length": 2400.6, "completions/max_terminated_length": 2171.8, "completions/mean_length": 990.3125, "completions/mean_terminated_length": 943.2689819335938, "completions/min_length": 443.4, "completions/min_terminated_length": 443.4, "entropy": 0.49701812863349915, "epoch": 0.09400705052878966, "frac_reward_zero_std": 0.3, "grad_norm": 1.343873143196106, "learning_rate": 3.0859375e-07, "loss": -0.0044, "num_tokens": 10033683.0, "reward": 0.17656249850988387, "reward_std": 0.16451094299554825, "rewards/e2e_recall_precision_mixed_reward/mean": 0.17656250447034835, "rewards/e2e_recall_precision_mixed_reward/std": 0.2665316700935364, "sampling/importance_sampling_ratio/max": 1.988339638710022, "sampling/importance_sampling_ratio/mean": 0.9998800039291382, "sampling/importance_sampling_ratio/min": 0.18474510461091995, "sampling/sampling_logp_difference/max": 1.813150119781494, "sampling/sampling_logp_difference/mean": 0.02064768560230732, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2283.6, "completions/max_terminated_length": 1947.8, "completions/mean_length": 964.471875, "completions/mean_terminated_length": 916.251416015625, "completions/min_length": 444.8, "completions/min_terminated_length": 444.8, "entropy": 0.47441959381103516, "epoch": 0.099882491186839, "frac_reward_zero_std": 0.3, "grad_norm": 1.2677315473556519, "learning_rate": 3.28125e-07, "loss": -0.0342, "num_tokens": 10654690.0, "reward": 0.16875, "reward_std": 0.16111062318086625, "rewards/e2e_recall_precision_mixed_reward/mean": 0.16875000149011612, "rewards/e2e_recall_precision_mixed_reward/std": 0.25565315783023834, "sampling/importance_sampling_ratio/max": 1.969423198699951, "sampling/importance_sampling_ratio/mean": 1.000089454650879, "sampling/importance_sampling_ratio/min": 0.14717500358819963, "sampling/sampling_logp_difference/max": 2.103808379173279, "sampling/sampling_logp_difference/mean": 0.02052120789885521, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 2336.6, "completions/max_terminated_length": 2279.0, "completions/mean_length": 1002.2875, "completions/mean_terminated_length": 984.4052978515625, "completions/min_length": 415.4, "completions/min_terminated_length": 415.4, "entropy": 0.4757814884185791, "epoch": 0.10575793184488837, "frac_reward_zero_std": 0.15, "grad_norm": 0.9901517033576965, "learning_rate": 3.4765625e-07, "loss": -0.0046, "num_tokens": 11290958.0, "reward": 0.2846354186534882, "reward_std": 0.27473083734512327, "rewards/e2e_recall_precision_mixed_reward/mean": 0.2846354186534882, "rewards/e2e_recall_precision_mixed_reward/std": 0.3511134922504425, "sampling/importance_sampling_ratio/max": 1.9479135990142822, "sampling/importance_sampling_ratio/mean": 0.9998469710350036, "sampling/importance_sampling_ratio/min": 0.2177226183936, "sampling/sampling_logp_difference/max": 1.9894657850265502, "sampling/sampling_logp_difference/mean": 0.019795811921358108, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375, "completions/max_length": 2199.2, "completions/max_terminated_length": 2174.0, "completions/mean_length": 993.784375, "completions/mean_terminated_length": 981.0612548828125, "completions/min_length": 410.8, "completions/min_terminated_length": 410.8, "entropy": 0.5097217082977294, "epoch": 0.11163337250293771, "frac_reward_zero_std": 0.2, "grad_norm": 1.1467088460922241, "learning_rate": 3.671875e-07, "loss": 0.0162, "num_tokens": 11907597.0, "reward": 0.28682292252779007, "reward_std": 0.24061587154865266, "rewards/e2e_recall_precision_mixed_reward/mean": 0.28682292252779007, "rewards/e2e_recall_precision_mixed_reward/std": 0.3374872386455536, "sampling/importance_sampling_ratio/max": 1.9729577779769898, "sampling/importance_sampling_ratio/mean": 1.0000439643859864, "sampling/importance_sampling_ratio/min": 0.2012358859181404, "sampling/sampling_logp_difference/max": 1.7005351781845093, "sampling/sampling_logp_difference/mean": 0.02079613581299782, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 2030.2, "completions/max_terminated_length": 1979.8, "completions/mean_length": 920.021875, "completions/mean_terminated_length": 902.3637084960938, "completions/min_length": 424.6, "completions/min_terminated_length": 424.6, "entropy": 0.47220299243927, "epoch": 0.11750881316098707, "frac_reward_zero_std": 0.1, "grad_norm": 1.0923038721084595, "learning_rate": 3.8671875e-07, "loss": -0.0021, "num_tokens": 12509988.0, "reward": 0.3457812681794167, "reward_std": 0.2333272099494934, "rewards/e2e_recall_precision_mixed_reward/mean": 0.3457812681794167, "rewards/e2e_recall_precision_mixed_reward/std": 0.31239897608757017, "sampling/importance_sampling_ratio/max": 1.989612627029419, "sampling/importance_sampling_ratio/mean": 0.9997779369354248, "sampling/importance_sampling_ratio/min": 0.2512615159153938, "sampling/sampling_logp_difference/max": 1.8858316898345948, "sampling/sampling_logp_difference/mean": 0.02036282978951931, "step": 100 }, { "epoch": 0.11750881316098707, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.00875, "eval_completions/max_length": 2168.76, "eval_completions/max_terminated_length": 2077.44, "eval_completions/mean_length": 973.209375, "eval_completions/mean_terminated_length": 960.3953955078125, "eval_completions/min_length": 450.52, "eval_completions/min_terminated_length": 450.52, "eval_entropy": 0.4852174758911133, "eval_frac_reward_zero_std": 0.28, "eval_loss": 0.009060491807758808, "eval_num_tokens": 12509988.0, "eval_reward": 0.3109791725873947, "eval_reward_std": 0.18228219971060752, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.31097917556762694, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.317981595993042, "eval_runtime": 666.9163, "eval_samples_per_second": 0.15, "eval_sampling/importance_sampling_ratio/max": 1.9854744291305542, "eval_sampling/importance_sampling_ratio/mean": 0.9999976515769958, "eval_sampling/importance_sampling_ratio/min": 0.2449254010617733, "eval_sampling/sampling_logp_difference/max": 1.6110917520523071, "eval_sampling/sampling_logp_difference/mean": 0.020074035078287124, "eval_steps_per_second": 0.003, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 2408.8, "completions/max_terminated_length": 2266.6, "completions/mean_length": 1018.646875, "completions/mean_terminated_length": 981.5686157226562, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "entropy": 0.48829834461212157, "epoch": 0.12338425381903642, "frac_reward_zero_std": 0.15, "grad_norm": 1.136791706085205, "learning_rate": 4.0625e-07, "loss": 0.0196, "num_tokens": 13133331.0, "reward": 0.32442708015441896, "reward_std": 0.2731425791978836, "rewards/e2e_recall_precision_mixed_reward/mean": 0.3244270861148834, "rewards/e2e_recall_precision_mixed_reward/std": 0.39561856985092164, "sampling/importance_sampling_ratio/max": 1.969839334487915, "sampling/importance_sampling_ratio/mean": 1.00008225440979, "sampling/importance_sampling_ratio/min": 0.27382618486881255, "sampling/sampling_logp_difference/max": 1.4294708490371704, "sampling/sampling_logp_difference/mean": 0.020213060453534125, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 2152.0, "completions/max_terminated_length": 2136.8, "completions/mean_length": 971.184375, "completions/mean_terminated_length": 962.189306640625, "completions/min_length": 476.8, "completions/min_terminated_length": 476.8, "entropy": 0.4925812900066376, "epoch": 0.12925969447708577, "frac_reward_zero_std": 0.25, "grad_norm": 1.1485499143600464, "learning_rate": 4.2578124999999997e-07, "loss": -0.0082, "num_tokens": 13765366.0, "reward": 0.3632812604308128, "reward_std": 0.2154562935233116, "rewards/e2e_recall_precision_mixed_reward/mean": 0.3632812604308128, "rewards/e2e_recall_precision_mixed_reward/std": 0.3520397037267685, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999819278717041, "sampling/importance_sampling_ratio/min": 0.2579982398077846, "sampling/sampling_logp_difference/max": 1.8680977821350098, "sampling/sampling_logp_difference/mean": 0.0207187470048666, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 2001.8, "completions/max_terminated_length": 1935.0, "completions/mean_length": 925.90625, "completions/mean_terminated_length": 920.6155395507812, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "entropy": 0.4797549247741699, "epoch": 0.13513513513513514, "frac_reward_zero_std": 0.25, "grad_norm": 0.7465731501579285, "learning_rate": 4.4531249999999997e-07, "loss": -0.0196, "num_tokens": 14365572.0, "reward": 0.4515625059604645, "reward_std": 0.1962999165058136, "rewards/e2e_recall_precision_mixed_reward/mean": 0.4515625059604645, "rewards/e2e_recall_precision_mixed_reward/std": 0.36195426285266874, "sampling/importance_sampling_ratio/max": 1.9571106672286986, "sampling/importance_sampling_ratio/mean": 0.9999866843223572, "sampling/importance_sampling_ratio/min": 0.2458883583545685, "sampling/sampling_logp_difference/max": 1.5592980146408082, "sampling/sampling_logp_difference/mean": 0.02007099725306034, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 2122.8, "completions/max_terminated_length": 1935.6, "completions/mean_length": 1038.65, "completions/mean_terminated_length": 1022.6489013671875, "completions/min_length": 478.6, "completions/min_terminated_length": 478.6, "entropy": 0.4651613235473633, "epoch": 0.1410105757931845, "frac_reward_zero_std": 0.15, "grad_norm": 0.8895966410636902, "learning_rate": 4.6484374999999997e-07, "loss": -0.0124, "num_tokens": 15020500.0, "reward": 0.3198958396911621, "reward_std": 0.20608305782079697, "rewards/e2e_recall_precision_mixed_reward/mean": 0.3198958396911621, "rewards/e2e_recall_precision_mixed_reward/std": 0.29388985931873324, "sampling/importance_sampling_ratio/max": 1.958699345588684, "sampling/importance_sampling_ratio/mean": 1.0000421762466432, "sampling/importance_sampling_ratio/min": 0.26278833746910096, "sampling/sampling_logp_difference/max": 1.4563037395477294, "sampling/sampling_logp_difference/mean": 0.019570792466402052, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.059375, "completions/max_length": 2256.8, "completions/max_terminated_length": 2058.4, "completions/mean_length": 1109.3, "completions/mean_terminated_length": 1020.2399169921875, "completions/min_length": 480.6, "completions/min_terminated_length": 480.6, "entropy": 0.4841072797775269, "epoch": 0.14688601645123384, "frac_reward_zero_std": 0.25, "grad_norm": 0.8954580426216125, "learning_rate": 4.84375e-07, "loss": -0.0518, "num_tokens": 15635352.0, "reward": 0.3098958432674408, "reward_std": 0.16147686839103698, "rewards/e2e_recall_precision_mixed_reward/mean": 0.3098958432674408, "rewards/e2e_recall_precision_mixed_reward/std": 0.3426029205322266, "sampling/importance_sampling_ratio/max": 1.9685364723205567, "sampling/importance_sampling_ratio/mean": 1.0001961946487428, "sampling/importance_sampling_ratio/min": 0.3116583779454231, "sampling/sampling_logp_difference/max": 1.2820564270019532, "sampling/sampling_logp_difference/mean": 0.020258011296391487, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 2127.8, "completions/max_terminated_length": 2040.8, "completions/mean_length": 1004.88125, "completions/mean_terminated_length": 1000.2262573242188, "completions/min_length": 502.6, "completions/min_terminated_length": 502.6, "entropy": 0.4659832537174225, "epoch": 0.15276145710928318, "frac_reward_zero_std": 0.25, "grad_norm": 0.921988308429718, "learning_rate": 4.998788466198207e-07, "loss": -0.0002, "num_tokens": 16296942.0, "reward": 0.36302084624767306, "reward_std": 0.19413625001907348, "rewards/e2e_recall_precision_mixed_reward/mean": 0.36302084624767306, "rewards/e2e_recall_precision_mixed_reward/std": 0.3169731110334396, "sampling/importance_sampling_ratio/max": 1.9976950645446778, "sampling/importance_sampling_ratio/mean": 1.000005567073822, "sampling/importance_sampling_ratio/min": 0.2085201695561409, "sampling/sampling_logp_difference/max": 1.7547591209411622, "sampling/sampling_logp_difference/mean": 0.019374296069145203, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021875, "completions/max_length": 2315.2, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1105.909375, "completions/mean_terminated_length": 1075.3181396484374, "completions/min_length": 484.6, "completions/min_terminated_length": 484.6, "entropy": 0.4691031098365784, "epoch": 0.15863689776733256, "frac_reward_zero_std": 0.2, "grad_norm": 0.7131484150886536, "learning_rate": 4.992730797189241e-07, "loss": -0.0383, "num_tokens": 16982229.0, "reward": 0.2643229216337204, "reward_std": 0.1752532333135605, "rewards/e2e_recall_precision_mixed_reward/mean": 0.2643229216337204, "rewards/e2e_recall_precision_mixed_reward/std": 0.2676455333828926, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999748706817627, "sampling/importance_sampling_ratio/min": 0.24143882989883422, "sampling/sampling_logp_difference/max": 1.5030804395675659, "sampling/sampling_logp_difference/mean": 0.019500917568802834, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.028125, "completions/max_length": 2466.6, "completions/max_terminated_length": 2179.8, "completions/mean_length": 1064.678125, "completions/mean_terminated_length": 1018.6744384765625, "completions/min_length": 527.2, "completions/min_terminated_length": 527.2, "entropy": 0.47563568949699403, "epoch": 0.1645123384253819, "frac_reward_zero_std": 0.15, "grad_norm": 1.0195895433425903, "learning_rate": 4.986673128180276e-07, "loss": 0.0268, "num_tokens": 17612586.0, "reward": 0.3347395837306976, "reward_std": 0.2197277307510376, "rewards/e2e_recall_precision_mixed_reward/mean": 0.3347395837306976, "rewards/e2e_recall_precision_mixed_reward/std": 0.31412242650985717, "sampling/importance_sampling_ratio/max": 1.9938777923583983, "sampling/importance_sampling_ratio/mean": 0.9999652743339539, "sampling/importance_sampling_ratio/min": 0.09777447709363969, "sampling/sampling_logp_difference/max": 4.446249318122864, "sampling/sampling_logp_difference/mean": 0.01978233680129051, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2201.0, "completions/max_terminated_length": 2164.0, "completions/mean_length": 1086.365625, "completions/mean_terminated_length": 1063.9522705078125, "completions/min_length": 471.8, "completions/min_terminated_length": 471.8, "entropy": 0.46193733215332033, "epoch": 0.17038777908343125, "frac_reward_zero_std": 0.15, "grad_norm": 1.0625224113464355, "learning_rate": 4.980615459171311e-07, "loss": 0.0038, "num_tokens": 18280347.0, "reward": 0.47427083253860475, "reward_std": 0.22851565778255462, "rewards/e2e_recall_precision_mixed_reward/mean": 0.47427083253860475, "rewards/e2e_recall_precision_mixed_reward/std": 0.3152575194835663, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000032937526703, "sampling/importance_sampling_ratio/min": 0.23960502099653241, "sampling/sampling_logp_difference/max": 2.9025020360946656, "sampling/sampling_logp_difference/mean": 0.019664775207638742, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 2436.0, "completions/max_terminated_length": 2295.6, "completions/mean_length": 1323.515625, "completions/mean_terminated_length": 1297.6546875, "completions/min_length": 648.4, "completions/min_terminated_length": 648.4, "entropy": 0.44961647391319276, "epoch": 0.1762632197414806, "frac_reward_zero_std": 0.05, "grad_norm": 0.8402144312858582, "learning_rate": 4.974557790162345e-07, "loss": 0.0206, "num_tokens": 19023648.0, "reward": 0.40973958671092986, "reward_std": 0.242001411318779, "rewards/e2e_recall_precision_mixed_reward/mean": 0.40973958671092986, "rewards/e2e_recall_precision_mixed_reward/std": 0.30689987242221833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000560522079467, "sampling/importance_sampling_ratio/min": 0.21943920934572816, "sampling/sampling_logp_difference/max": 2.0875226736068724, "sampling/sampling_logp_difference/mean": 0.018646536394953728, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375, "completions/max_length": 2189.8, "completions/max_terminated_length": 2100.0, "completions/mean_length": 1232.584375, "completions/mean_terminated_length": 1221.488134765625, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "entropy": 0.4616339147090912, "epoch": 0.18213866039952997, "frac_reward_zero_std": 0.15, "grad_norm": 0.8758718967437744, "learning_rate": 4.96850012115338e-07, "loss": -0.0048, "num_tokens": 19727359.0, "reward": 0.3982812583446503, "reward_std": 0.23960140347480774, "rewards/e2e_recall_precision_mixed_reward/mean": 0.3982812583446503, "rewards/e2e_recall_precision_mixed_reward/std": 0.3587407112121582, "sampling/importance_sampling_ratio/max": 1.988889455795288, "sampling/importance_sampling_ratio/mean": 1.0001670956611632, "sampling/importance_sampling_ratio/min": 0.3248328477144241, "sampling/sampling_logp_difference/max": 1.1651965141296388, "sampling/sampling_logp_difference/mean": 0.01926850378513336, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2299.6, "completions/max_terminated_length": 2056.8, "completions/mean_length": 1074.50625, "completions/mean_terminated_length": 1053.7550659179688, "completions/min_length": 482.2, "completions/min_terminated_length": 482.2, "entropy": 0.44536136984825136, "epoch": 0.18801410105757932, "frac_reward_zero_std": 0.15, "grad_norm": 0.8042799830436707, "learning_rate": 4.962442452144414e-07, "loss": 0.0248, "num_tokens": 20380797.0, "reward": 0.4602083504199982, "reward_std": 0.21384959816932678, "rewards/e2e_recall_precision_mixed_reward/mean": 0.4602083384990692, "rewards/e2e_recall_precision_mixed_reward/std": 0.3380684912204742, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000497698783875, "sampling/importance_sampling_ratio/min": 0.24892064929008484, "sampling/sampling_logp_difference/max": 1.4376299142837525, "sampling/sampling_logp_difference/mean": 0.019230544194579126, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.028125, "completions/max_length": 2365.4, "completions/max_terminated_length": 2204.2, "completions/mean_length": 1309.959375, "completions/mean_terminated_length": 1276.075146484375, "completions/min_length": 693.6, "completions/min_terminated_length": 693.6, "entropy": 0.45925586819648745, "epoch": 0.19388954171562867, "frac_reward_zero_std": 0.05, "grad_norm": 0.8627838492393494, "learning_rate": 4.956384783135449e-07, "loss": 0.0015, "num_tokens": 21111084.0, "reward": 0.4284895807504654, "reward_std": 0.24629194140434266, "rewards/e2e_recall_precision_mixed_reward/mean": 0.4284895807504654, "rewards/e2e_recall_precision_mixed_reward/std": 0.32105106115341187, "sampling/importance_sampling_ratio/max": 1.9395444869995118, "sampling/importance_sampling_ratio/mean": 0.9999743103981018, "sampling/importance_sampling_ratio/min": 0.3220216006040573, "sampling/sampling_logp_difference/max": 1.197959566116333, "sampling/sampling_logp_difference/mean": 0.018676093593239785, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.071875, "completions/max_length": 2413.2, "completions/max_terminated_length": 2344.0, "completions/mean_length": 1377.446875, "completions/mean_terminated_length": 1291.2495361328124, "completions/min_length": 661.8, "completions/min_terminated_length": 661.8, "entropy": 0.44391797184944154, "epoch": 0.199764982373678, "frac_reward_zero_std": 0.2, "grad_norm": 0.5744222402572632, "learning_rate": 4.950327114126484e-07, "loss": -0.0102, "num_tokens": 21817791.0, "reward": 0.37614584267139434, "reward_std": 0.18710350692272187, "rewards/e2e_recall_precision_mixed_reward/mean": 0.37614584267139434, "rewards/e2e_recall_precision_mixed_reward/std": 0.3078536331653595, "sampling/importance_sampling_ratio/max": 1.932727074623108, "sampling/importance_sampling_ratio/mean": 0.9998797655105591, "sampling/importance_sampling_ratio/min": 0.17999765202403067, "sampling/sampling_logp_difference/max": 2.0344920635223387, "sampling/sampling_logp_difference/mean": 0.018212152644991875, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.053125, "completions/max_length": 2500.0, "completions/max_terminated_length": 2317.2, "completions/mean_length": 1387.946875, "completions/mean_terminated_length": 1327.648046875, "completions/min_length": 684.8, "completions/min_terminated_length": 684.8, "entropy": 0.4456583082675934, "epoch": 0.2056404230317274, "frac_reward_zero_std": 0.2, "grad_norm": 1.0140538215637207, "learning_rate": 4.944269445117519e-07, "loss": -0.0183, "num_tokens": 22518842.0, "reward": 0.3370312511920929, "reward_std": 0.19164448380470275, "rewards/e2e_recall_precision_mixed_reward/mean": 0.337031252682209, "rewards/e2e_recall_precision_mixed_reward/std": 0.29072641432285307, "sampling/importance_sampling_ratio/max": 1.9975413084030151, "sampling/importance_sampling_ratio/mean": 1.0000341892242433, "sampling/importance_sampling_ratio/min": 0.31241180300712584, "sampling/sampling_logp_difference/max": 1.204920768737793, "sampling/sampling_logp_difference/mean": 0.01850493885576725, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021875, "completions/max_length": 2447.2, "completions/max_terminated_length": 2257.8, "completions/mean_length": 1337.359375, "completions/mean_terminated_length": 1312.4825439453125, "completions/min_length": 656.2, "completions/min_terminated_length": 656.2, "entropy": 0.45110672116279604, "epoch": 0.21151586368977673, "frac_reward_zero_std": 0.05, "grad_norm": 0.9597908854484558, "learning_rate": 4.938211776108554e-07, "loss": -0.0207, "num_tokens": 23240145.0, "reward": 0.4163541615009308, "reward_std": 0.21045998930931092, "rewards/e2e_recall_precision_mixed_reward/mean": 0.4163541793823242, "rewards/e2e_recall_precision_mixed_reward/std": 0.289658859372139, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000091910362243, "sampling/importance_sampling_ratio/min": 0.2906631052494049, "sampling/sampling_logp_difference/max": 1.337960433959961, "sampling/sampling_logp_difference/mean": 0.018258562311530113, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 2301.8, "completions/max_terminated_length": 2122.8, "completions/mean_length": 1253.5125, "completions/mean_terminated_length": 1237.3944091796875, "completions/min_length": 634.2, "completions/min_terminated_length": 634.2, "entropy": 0.43949020504951475, "epoch": 0.21739130434782608, "frac_reward_zero_std": 0.15, "grad_norm": 1.036664366722107, "learning_rate": 4.932154107099588e-07, "loss": -0.0049, "num_tokens": 23945637.0, "reward": 0.3103645980358124, "reward_std": 0.22137612998485565, "rewards/e2e_recall_precision_mixed_reward/mean": 0.3103645980358124, "rewards/e2e_recall_precision_mixed_reward/std": 0.3476561546325684, "sampling/importance_sampling_ratio/max": 1.9506564617156983, "sampling/importance_sampling_ratio/mean": 0.9999400854110718, "sampling/importance_sampling_ratio/min": 0.23950822800397872, "sampling/sampling_logp_difference/max": 1.6213460922241212, "sampling/sampling_logp_difference/mean": 0.018114538118243217, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875, "completions/max_length": 2325.6, "completions/max_terminated_length": 2258.8, "completions/mean_length": 1329.1875, "completions/mean_terminated_length": 1308.8904541015625, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "entropy": 0.42851370573043823, "epoch": 0.22326674500587543, "frac_reward_zero_std": 0.1, "grad_norm": 0.8615764379501343, "learning_rate": 4.926096438090623e-07, "loss": -0.0222, "num_tokens": 24671129.0, "reward": 0.45187500715255735, "reward_std": 0.19458201229572297, "rewards/e2e_recall_precision_mixed_reward/mean": 0.45187500715255735, "rewards/e2e_recall_precision_mixed_reward/std": 0.32191779315471647, "sampling/importance_sampling_ratio/max": 1.9079566240310668, "sampling/importance_sampling_ratio/mean": 1.0002107262611388, "sampling/importance_sampling_ratio/min": 0.2823118090629578, "sampling/sampling_logp_difference/max": 1.2879379272460938, "sampling/sampling_logp_difference/mean": 0.01770188324153423, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 2432.4, "completions/max_terminated_length": 2339.2, "completions/mean_length": 1379.309375, "completions/mean_terminated_length": 1366.236669921875, "completions/min_length": 681.4, "completions/min_terminated_length": 681.4, "entropy": 0.43162922859191893, "epoch": 0.2291421856639248, "frac_reward_zero_std": 0.2, "grad_norm": 0.5466075539588928, "learning_rate": 4.920038769081657e-07, "loss": 0.0155, "num_tokens": 25448412.0, "reward": 0.3679687574505806, "reward_std": 0.1814160704612732, "rewards/e2e_recall_precision_mixed_reward/mean": 0.3679687574505806, "rewards/e2e_recall_precision_mixed_reward/std": 0.26374678313732147, "sampling/importance_sampling_ratio/max": 1.9999159097671508, "sampling/importance_sampling_ratio/mean": 1.0000950813293457, "sampling/importance_sampling_ratio/min": 0.2659080035984516, "sampling/sampling_logp_difference/max": 1.629682731628418, "sampling/sampling_logp_difference/mean": 0.017587186023592948, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 2394.8, "completions/max_terminated_length": 2268.2, "completions/mean_length": 1396.196875, "completions/mean_terminated_length": 1368.7766845703125, "completions/min_length": 593.6, "completions/min_terminated_length": 593.6, "entropy": 0.45481058955192566, "epoch": 0.23501762632197415, "frac_reward_zero_std": 0.1, "grad_norm": 0.5931362509727478, "learning_rate": 4.913981100072691e-07, "loss": -0.0407, "num_tokens": 26210379.0, "reward": 0.4175000131130219, "reward_std": 0.23002639412879944, "rewards/e2e_recall_precision_mixed_reward/mean": 0.4175000131130219, "rewards/e2e_recall_precision_mixed_reward/std": 0.33679488897323606, "sampling/importance_sampling_ratio/max": 1.951190209388733, "sampling/importance_sampling_ratio/mean": 0.9999919533729553, "sampling/importance_sampling_ratio/min": 0.2368324212729931, "sampling/sampling_logp_difference/max": 1.608810019493103, "sampling/sampling_logp_difference/mean": 0.01873408742249012, "step": 200 }, { "epoch": 0.23501762632197415, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.014375, "eval_completions/max_length": 2369.72, "eval_completions/max_terminated_length": 2250.0, "eval_completions/mean_length": 1299.76125, "eval_completions/mean_terminated_length": 1283.172724609375, "eval_completions/min_length": 709.2, "eval_completions/min_terminated_length": 709.2, "eval_entropy": 0.42435838222503663, "eval_frac_reward_zero_std": 0.14, "eval_loss": 0.002923845313489437, "eval_num_tokens": 26210379.0, "eval_reward": 0.422958345413208, "eval_reward_std": 0.1989289104938507, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.4229583466053009, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.31139856338500976, "eval_runtime": 603.448, "eval_samples_per_second": 0.166, "eval_sampling/importance_sampling_ratio/max": 1.9720039033889771, "eval_sampling/importance_sampling_ratio/mean": 1.0000151467323304, "eval_sampling/importance_sampling_ratio/min": 0.2624286452680826, "eval_sampling/sampling_logp_difference/max": 1.560486044883728, "eval_sampling/sampling_logp_difference/mean": 0.017812692523002625, "eval_steps_per_second": 0.003, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 2368.2, "completions/max_terminated_length": 2345.2, "completions/mean_length": 1356.390625, "completions/mean_terminated_length": 1348.66416015625, "completions/min_length": 700.8, "completions/min_terminated_length": 700.8, "entropy": 0.42209169268608093, "epoch": 0.2408930669800235, "frac_reward_zero_std": 0.2, "grad_norm": 0.8277153372764587, "learning_rate": 4.907923431063726e-07, "loss": 0.0107, "num_tokens": 26975488.0, "reward": 0.45453126430511476, "reward_std": 0.20152193903923035, "rewards/e2e_recall_precision_mixed_reward/mean": 0.45453126430511476, "rewards/e2e_recall_precision_mixed_reward/std": 0.3548352122306824, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000168442726136, "sampling/importance_sampling_ratio/min": 0.2511422336101532, "sampling/sampling_logp_difference/max": 1.4191609382629395, "sampling/sampling_logp_difference/mean": 0.017601443454623222, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2207.8, "completions/max_terminated_length": 2191.8, "completions/mean_length": 1347.0625, "completions/mean_terminated_length": 1314.820458984375, "completions/min_length": 608.6, "completions/min_terminated_length": 608.6, "entropy": 0.43602086901664733, "epoch": 0.24676850763807284, "frac_reward_zero_std": 0.0, "grad_norm": 0.9807378649711609, "learning_rate": 4.901865762054761e-07, "loss": -0.0325, "num_tokens": 27692204.0, "reward": 0.40880208611488345, "reward_std": 0.248151096701622, "rewards/e2e_recall_precision_mixed_reward/mean": 0.4088020920753479, "rewards/e2e_recall_precision_mixed_reward/std": 0.30966946482658386, "sampling/importance_sampling_ratio/max": 1.934563159942627, "sampling/importance_sampling_ratio/mean": 0.9999630689620972, "sampling/importance_sampling_ratio/min": 0.16581638418138028, "sampling/sampling_logp_difference/max": 2.415492820739746, "sampling/sampling_logp_difference/mean": 0.01788683459162712, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875, "completions/max_length": 2410.6, "completions/max_terminated_length": 2362.2, "completions/mean_length": 1393.41875, "completions/mean_terminated_length": 1373.951611328125, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "entropy": 0.4307835817337036, "epoch": 0.2526439482961222, "frac_reward_zero_std": 0.15, "grad_norm": 0.9995214343070984, "learning_rate": 4.895808093045796e-07, "loss": 0.0368, "num_tokens": 28450426.0, "reward": 0.46427084505558014, "reward_std": 0.21769410669803618, "rewards/e2e_recall_precision_mixed_reward/mean": 0.46427084505558014, "rewards/e2e_recall_precision_mixed_reward/std": 0.34339686036109923, "sampling/importance_sampling_ratio/max": 1.9890089750289917, "sampling/importance_sampling_ratio/mean": 0.9999396681785584, "sampling/importance_sampling_ratio/min": 0.16661263704299928, "sampling/sampling_logp_difference/max": 1.9397084712982178, "sampling/sampling_logp_difference/mean": 0.01795981228351593, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2269.0, "completions/max_terminated_length": 2211.8, "completions/mean_length": 1407.796875, "completions/mean_terminated_length": 1390.346142578125, "completions/min_length": 765.2, "completions/min_terminated_length": 765.2, "entropy": 0.42940289378166197, "epoch": 0.25851938895417154, "frac_reward_zero_std": 0.2, "grad_norm": 0.9037354588508606, "learning_rate": 4.889750424036831e-07, "loss": -0.0166, "num_tokens": 29224677.0, "reward": 0.4435416698455811, "reward_std": 0.17664896845817565, "rewards/e2e_recall_precision_mixed_reward/mean": 0.4435416698455811, "rewards/e2e_recall_precision_mixed_reward/std": 0.29355489313602445, "sampling/importance_sampling_ratio/max": 1.9557361602783203, "sampling/importance_sampling_ratio/mean": 0.9999936103820801, "sampling/importance_sampling_ratio/min": 0.30081471651792524, "sampling/sampling_logp_difference/max": 1.4767087697982788, "sampling/sampling_logp_difference/mean": 0.018042086437344552, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.028125, "completions/max_length": 2500.0, "completions/max_terminated_length": 2453.4, "completions/mean_length": 1509.70625, "completions/mean_terminated_length": 1481.46875, "completions/min_length": 920.2, "completions/min_terminated_length": 920.2, "entropy": 0.4266699433326721, "epoch": 0.26439482961222094, "frac_reward_zero_std": 0.15, "grad_norm": 0.4904046356678009, "learning_rate": 4.883692755027865e-07, "loss": -0.0152, "num_tokens": 29973283.0, "reward": 0.3910937547683716, "reward_std": 0.1875857561826706, "rewards/e2e_recall_precision_mixed_reward/mean": 0.3910937488079071, "rewards/e2e_recall_precision_mixed_reward/std": 0.3184710621833801, "sampling/importance_sampling_ratio/max": 1.9934840202331543, "sampling/importance_sampling_ratio/mean": 0.9998901844024658, "sampling/importance_sampling_ratio/min": 0.30794100314378736, "sampling/sampling_logp_difference/max": 1.3923618793487549, "sampling/sampling_logp_difference/mean": 0.017345474287867545, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 2255.6, "completions/max_terminated_length": 2215.8, "completions/mean_length": 1363.684375, "completions/mean_terminated_length": 1348.9182373046874, "completions/min_length": 705.0, "completions/min_terminated_length": 705.0, "entropy": 0.4166957139968872, "epoch": 0.2702702702702703, "frac_reward_zero_std": 0.15, "grad_norm": 0.8560689687728882, "learning_rate": 4.8776350860189e-07, "loss": -0.013, "num_tokens": 30756366.0, "reward": 0.5055729269981384, "reward_std": 0.17977026402950286, "rewards/e2e_recall_precision_mixed_reward/mean": 0.505572932958603, "rewards/e2e_recall_precision_mixed_reward/std": 0.25386624932289126, "sampling/importance_sampling_ratio/max": 1.954643964767456, "sampling/importance_sampling_ratio/mean": 1.000126600265503, "sampling/importance_sampling_ratio/min": 0.35047273635864257, "sampling/sampling_logp_difference/max": 1.0577001810073852, "sampling/sampling_logp_difference/mean": 0.017576563358306884, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021875, "completions/max_length": 2500.0, "completions/max_terminated_length": 2310.6, "completions/mean_length": 1463.453125, "completions/mean_terminated_length": 1441.3449951171874, "completions/min_length": 834.8, "completions/min_terminated_length": 834.8, "entropy": 0.4108987033367157, "epoch": 0.27614571092831963, "frac_reward_zero_std": 0.1, "grad_norm": 0.7477687001228333, "learning_rate": 4.871577417009934e-07, "loss": -0.0453, "num_tokens": 31537571.0, "reward": 0.43791667819023133, "reward_std": 0.205858114361763, "rewards/e2e_recall_precision_mixed_reward/mean": 0.43791667819023133, "rewards/e2e_recall_precision_mixed_reward/std": 0.29302410781383514, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999675035476685, "sampling/importance_sampling_ratio/min": 0.174767720699463, "sampling/sampling_logp_difference/max": 6.906278848648071, "sampling/sampling_logp_difference/mean": 0.01736109107732773, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.028125, "completions/max_length": 2289.8, "completions/max_terminated_length": 2215.6, "completions/mean_length": 1420.634375, "completions/mean_terminated_length": 1395.6822998046875, "completions/min_length": 808.0, "completions/min_terminated_length": 808.0, "entropy": 0.4105457663536072, "epoch": 0.282021151586369, "frac_reward_zero_std": 0.05, "grad_norm": 0.8343913555145264, "learning_rate": 4.865519748000969e-07, "loss": -0.0394, "num_tokens": 32276378.0, "reward": 0.48182291984558107, "reward_std": 0.21514118313789368, "rewards/e2e_recall_precision_mixed_reward/mean": 0.48182291984558107, "rewards/e2e_recall_precision_mixed_reward/std": 0.29770660400390625, "sampling/importance_sampling_ratio/max": 1.9636056900024415, "sampling/importance_sampling_ratio/mean": 0.9999726414680481, "sampling/importance_sampling_ratio/min": 0.3000651866197586, "sampling/sampling_logp_difference/max": 1.286457371711731, "sampling/sampling_logp_difference/mean": 0.017064289748668672, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 2353.2, "completions/max_terminated_length": 2159.0, "completions/mean_length": 1367.946875, "completions/mean_terminated_length": 1353.3779052734376, "completions/min_length": 807.0, "completions/min_terminated_length": 807.0, "entropy": 0.3979348599910736, "epoch": 0.2878965922444183, "frac_reward_zero_std": 0.05, "grad_norm": 0.831149160861969, "learning_rate": 4.859462078992004e-07, "loss": -0.0077, "num_tokens": 33040329.0, "reward": 0.4345312714576721, "reward_std": 0.2075097978115082, "rewards/e2e_recall_precision_mixed_reward/mean": 0.4345312714576721, "rewards/e2e_recall_precision_mixed_reward/std": 0.29992562234401704, "sampling/importance_sampling_ratio/max": 1.947805905342102, "sampling/importance_sampling_ratio/mean": 0.9999383926391602, "sampling/importance_sampling_ratio/min": 0.2707966983318329, "sampling/sampling_logp_difference/max": 1.6397278547286986, "sampling/sampling_logp_difference/mean": 0.016731590032577515, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 2313.4, "completions/max_terminated_length": 2260.0, "completions/mean_length": 1422.896875, "completions/mean_terminated_length": 1419.680224609375, "completions/min_length": 828.2, "completions/min_terminated_length": 828.2, "entropy": 0.40560716986656187, "epoch": 0.2937720329024677, "frac_reward_zero_std": 0.15, "grad_norm": 0.8927066922187805, "learning_rate": 4.853404409983038e-07, "loss": 0.0072, "num_tokens": 33791348.0, "reward": 0.5130208373069763, "reward_std": 0.1809627652168274, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5130208373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.2381391167640686, "sampling/importance_sampling_ratio/max": 1.878486657142639, "sampling/importance_sampling_ratio/mean": 1.0000173091888427, "sampling/importance_sampling_ratio/min": 0.16526238694787027, "sampling/sampling_logp_difference/max": 2.048710656166077, "sampling/sampling_logp_difference/mean": 0.016782762855291365, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021875, "completions/max_length": 2481.2, "completions/max_terminated_length": 2422.4, "completions/mean_length": 1547.384375, "completions/mean_terminated_length": 1526.5686279296874, "completions/min_length": 923.0, "completions/min_terminated_length": 923.0, "entropy": 0.423099547624588, "epoch": 0.299647473560517, "frac_reward_zero_std": 0.1, "grad_norm": 0.771557092666626, "learning_rate": 4.847346740974073e-07, "loss": -0.0087, "num_tokens": 34596563.0, "reward": 0.46609375476837156, "reward_std": 0.22107117474079133, "rewards/e2e_recall_precision_mixed_reward/mean": 0.46609376072883607, "rewards/e2e_recall_precision_mixed_reward/std": 0.30911803245544434, "sampling/importance_sampling_ratio/max": 1.9809560775756836, "sampling/importance_sampling_ratio/mean": 1.0000070095062257, "sampling/importance_sampling_ratio/min": 0.26680448576807975, "sampling/sampling_logp_difference/max": 1.5669462442398072, "sampling/sampling_logp_difference/mean": 0.017237287014722824, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875, "completions/max_length": 2367.2, "completions/max_terminated_length": 2233.0, "completions/mean_length": 1370.38125, "completions/mean_terminated_length": 1349.311376953125, "completions/min_length": 832.6, "completions/min_terminated_length": 832.6, "entropy": 0.4151655673980713, "epoch": 0.30552291421856637, "frac_reward_zero_std": 0.2, "grad_norm": 0.8248306512832642, "learning_rate": 4.841289071965108e-07, "loss": 0.0028, "num_tokens": 35339573.0, "reward": 0.35020835101604464, "reward_std": 0.1863805890083313, "rewards/e2e_recall_precision_mixed_reward/mean": 0.35020834505558013, "rewards/e2e_recall_precision_mixed_reward/std": 0.3319753110408783, "sampling/importance_sampling_ratio/max": 1.9679210186004639, "sampling/importance_sampling_ratio/mean": 0.9999267935752869, "sampling/importance_sampling_ratio/min": 0.3026794917881489, "sampling/sampling_logp_difference/max": 1.417945671081543, "sampling/sampling_logp_difference/mean": 0.017283813282847404, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06875, "completions/max_length": 2469.0, "completions/max_terminated_length": 2381.2, "completions/mean_length": 1549.65, "completions/mean_terminated_length": 1483.4290283203125, "completions/min_length": 774.2, "completions/min_terminated_length": 774.2, "entropy": 0.42785446643829345, "epoch": 0.31139835487661577, "frac_reward_zero_std": 0.1, "grad_norm": 0.6696219444274902, "learning_rate": 4.835231402956143e-07, "loss": -0.0622, "num_tokens": 36133117.0, "reward": 0.4380729258060455, "reward_std": 0.207410229742527, "rewards/e2e_recall_precision_mixed_reward/mean": 0.4380729258060455, "rewards/e2e_recall_precision_mixed_reward/std": 0.3342203199863434, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000066590309143, "sampling/importance_sampling_ratio/min": 0.26250506937503815, "sampling/sampling_logp_difference/max": 1.4055633783340453, "sampling/sampling_logp_difference/mean": 0.017540974915027617, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2450.4, "completions/max_terminated_length": 2289.0, "completions/mean_length": 1445.24375, "completions/mean_terminated_length": 1428.3465087890625, "completions/min_length": 873.8, "completions/min_terminated_length": 873.8, "entropy": 0.42337934374809266, "epoch": 0.3172737955346651, "frac_reward_zero_std": 0.05, "grad_norm": 0.6972928643226624, "learning_rate": 4.829173733947177e-07, "loss": -0.0112, "num_tokens": 36890567.0, "reward": 0.4680208325386047, "reward_std": 0.20582786798477173, "rewards/e2e_recall_precision_mixed_reward/mean": 0.4680208325386047, "rewards/e2e_recall_precision_mixed_reward/std": 0.2813736617565155, "sampling/importance_sampling_ratio/max": 1.9894936561584473, "sampling/importance_sampling_ratio/mean": 1.0000147461891173, "sampling/importance_sampling_ratio/min": 0.2970141440629959, "sampling/sampling_logp_difference/max": 1.4801963686943054, "sampling/sampling_logp_difference/mean": 0.01727181263267994, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2370.0, "completions/max_terminated_length": 2325.6, "completions/mean_length": 1453.1125, "completions/mean_terminated_length": 1439.437353515625, "completions/min_length": 859.2, "completions/min_terminated_length": 859.2, "entropy": 0.43045341968536377, "epoch": 0.32314923619271446, "frac_reward_zero_std": 0.25, "grad_norm": 0.6425484418869019, "learning_rate": 4.823116064938211e-07, "loss": 0.0202, "num_tokens": 37673687.0, "reward": 0.35296875834465025, "reward_std": 0.18358486890792847, "rewards/e2e_recall_precision_mixed_reward/mean": 0.35296875834465025, "rewards/e2e_recall_precision_mixed_reward/std": 0.2992075264453888, "sampling/importance_sampling_ratio/max": 1.9819918394088745, "sampling/importance_sampling_ratio/mean": 1.0000043153762816, "sampling/importance_sampling_ratio/min": 0.28931107074022294, "sampling/sampling_logp_difference/max": 1.379153299331665, "sampling/sampling_logp_difference/mean": 0.017378567531704903, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 2150.0, "completions/max_terminated_length": 2112.6, "completions/mean_length": 1302.640625, "completions/mean_terminated_length": 1295.7153076171876, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "entropy": 0.39050028920173646, "epoch": 0.3290246768507638, "frac_reward_zero_std": 0.15, "grad_norm": 0.6917908787727356, "learning_rate": 4.817058395929246e-07, "loss": -0.0274, "num_tokens": 38388284.0, "reward": 0.5204687595367432, "reward_std": 0.19263336360454558, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5204687595367432, "rewards/e2e_recall_precision_mixed_reward/std": 0.2920966506004333, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999744057655334, "sampling/importance_sampling_ratio/min": 0.3137810334563255, "sampling/sampling_logp_difference/max": 1.2811630487442016, "sampling/sampling_logp_difference/mean": 0.016145946830511092, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 2261.2, "completions/max_terminated_length": 2259.8, "completions/mean_length": 1407.540625, "completions/mean_terminated_length": 1404.1890625, "completions/min_length": 876.2, "completions/min_terminated_length": 876.2, "entropy": 0.40756009221076966, "epoch": 0.33490011750881316, "frac_reward_zero_std": 0.15, "grad_norm": 0.7234519720077515, "learning_rate": 4.81100072692028e-07, "loss": 0.0238, "num_tokens": 39186805.0, "reward": 0.4428645968437195, "reward_std": 0.1544080436229706, "rewards/e2e_recall_precision_mixed_reward/mean": 0.4428645968437195, "rewards/e2e_recall_precision_mixed_reward/std": 0.23549820780754088, "sampling/importance_sampling_ratio/max": 1.9689576625823975, "sampling/importance_sampling_ratio/mean": 0.9999301791191101, "sampling/importance_sampling_ratio/min": 0.1660400189459324, "sampling/sampling_logp_difference/max": 2.068434953689575, "sampling/sampling_logp_difference/mean": 0.016894153505563735, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875, "completions/max_length": 2325.8, "completions/max_terminated_length": 2113.4, "completions/mean_length": 1382.234375, "completions/mean_terminated_length": 1360.899755859375, "completions/min_length": 834.0, "completions/min_terminated_length": 834.0, "entropy": 0.4158647537231445, "epoch": 0.3407755581668625, "frac_reward_zero_std": 0.15, "grad_norm": 0.5713046789169312, "learning_rate": 4.804943057911315e-07, "loss": -0.0197, "num_tokens": 39935944.0, "reward": 0.36718750596046446, "reward_std": 0.16592961102724074, "rewards/e2e_recall_precision_mixed_reward/mean": 0.36718750596046446, "rewards/e2e_recall_precision_mixed_reward/std": 0.299328675866127, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999416589736938, "sampling/importance_sampling_ratio/min": 0.3071061834692955, "sampling/sampling_logp_difference/max": 1.781144905090332, "sampling/sampling_logp_difference/mean": 0.01737259849905968, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 2227.0, "completions/max_terminated_length": 2149.0, "completions/mean_length": 1325.434375, "completions/mean_terminated_length": 1311.261669921875, "completions/min_length": 827.2, "completions/min_terminated_length": 827.2, "entropy": 0.40315585732460024, "epoch": 0.34665099882491185, "frac_reward_zero_std": 0.05, "grad_norm": 0.8399536609649658, "learning_rate": 4.79888538890235e-07, "loss": -0.0172, "num_tokens": 40637027.0, "reward": 0.5668229222297668, "reward_std": 0.238536736369133, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5668229222297668, "rewards/e2e_recall_precision_mixed_reward/std": 0.3036984860897064, "sampling/importance_sampling_ratio/max": 1.9938727378845216, "sampling/importance_sampling_ratio/mean": 0.9999548077583313, "sampling/importance_sampling_ratio/min": 0.2874012000946095, "sampling/sampling_logp_difference/max": 2.626276063919067, "sampling/sampling_logp_difference/mean": 0.016770557686686515, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 2359.0, "completions/max_terminated_length": 2342.8, "completions/mean_length": 1378.61875, "completions/mean_terminated_length": 1365.0980712890625, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "entropy": 0.40095179080963134, "epoch": 0.3525264394829612, "frac_reward_zero_std": 0.25, "grad_norm": 0.8633328676223755, "learning_rate": 4.792827719893385e-07, "loss": 0.0037, "num_tokens": 41401417.0, "reward": 0.4402604281902313, "reward_std": 0.18189195394515992, "rewards/e2e_recall_precision_mixed_reward/mean": 0.4402604281902313, "rewards/e2e_recall_precision_mixed_reward/std": 0.3080439865589142, "sampling/importance_sampling_ratio/max": 1.9228517293930054, "sampling/importance_sampling_ratio/mean": 0.9999574422836304, "sampling/importance_sampling_ratio/min": 0.27484258711338044, "sampling/sampling_logp_difference/max": 1.4705661535263062, "sampling/sampling_logp_difference/mean": 0.016645203903317453, "step": 300 }, { "epoch": 0.3525264394829612, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.00875, "eval_completions/max_length": 2255.24, "eval_completions/max_terminated_length": 2190.56, "eval_completions/mean_length": 1306.115, "eval_completions/mean_terminated_length": 1296.024951171875, "eval_completions/min_length": 795.84, "eval_completions/min_terminated_length": 795.84, "eval_entropy": 0.39472726941108705, "eval_frac_reward_zero_std": 0.14, "eval_loss": 0.0051555633544921875, "eval_num_tokens": 41401417.0, "eval_reward": 0.4641666793823242, "eval_reward_std": 0.1855441576242447, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.46416668176651, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3040527403354645, "eval_runtime": 591.545, "eval_samples_per_second": 0.169, "eval_sampling/importance_sampling_ratio/max": 1.9726946783065795, "eval_sampling/importance_sampling_ratio/mean": 1.000031213760376, "eval_sampling/importance_sampling_ratio/min": 0.2702025346830487, "eval_sampling/sampling_logp_difference/max": 1.6381762075424193, "eval_sampling/sampling_logp_difference/mean": 0.016649646908044814, "eval_steps_per_second": 0.003, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875, "completions/max_length": 2440.6, "completions/max_terminated_length": 2256.6, "completions/mean_length": 1346.771875, "completions/mean_terminated_length": 1326.0095458984374, "completions/min_length": 766.4, "completions/min_terminated_length": 766.4, "entropy": 0.3954951822757721, "epoch": 0.3584018801410106, "frac_reward_zero_std": 0.1, "grad_norm": 0.6830638647079468, "learning_rate": 4.78677005088442e-07, "loss": -0.0205, "num_tokens": 42149976.0, "reward": 0.5190104305744171, "reward_std": 0.2403048187494278, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5190104246139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.32388275265693667, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999425649642945, "sampling/importance_sampling_ratio/min": 0.2714868515729904, "sampling/sampling_logp_difference/max": 1.3850975036621094, "sampling/sampling_logp_difference/mean": 0.01655212976038456, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375, "completions/max_length": 2315.6, "completions/max_terminated_length": 2119.4, "completions/mean_length": 1415.84375, "completions/mean_terminated_length": 1406.0834228515625, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "entropy": 0.3955690324306488, "epoch": 0.36427732079905994, "frac_reward_zero_std": 0.2, "grad_norm": 0.48863622546195984, "learning_rate": 4.780712381875454e-07, "loss": 0.0052, "num_tokens": 42944954.0, "reward": 0.36197916865348817, "reward_std": 0.17809403240680693, "rewards/e2e_recall_precision_mixed_reward/mean": 0.3619791746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.2854657083749771, "sampling/importance_sampling_ratio/max": 1.989390754699707, "sampling/importance_sampling_ratio/mean": 0.9999927639961242, "sampling/importance_sampling_ratio/min": 0.3247060298919678, "sampling/sampling_logp_difference/max": 1.2260807275772094, "sampling/sampling_logp_difference/mean": 0.016654501855373382, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2313.8, "completions/max_terminated_length": 2162.2, "completions/mean_length": 1346.1375, "completions/mean_terminated_length": 1327.4448486328124, "completions/min_length": 794.2, "completions/min_terminated_length": 794.2, "entropy": 0.405675995349884, "epoch": 0.3701527614571093, "frac_reward_zero_std": 0.1, "grad_norm": 0.6851405501365662, "learning_rate": 4.774654712866488e-07, "loss": -0.0064, "num_tokens": 43692738.0, "reward": 0.5561979353427887, "reward_std": 0.20282128155231477, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5561979293823243, "rewards/e2e_recall_precision_mixed_reward/std": 0.3183633327484131, "sampling/importance_sampling_ratio/max": 1.9703486680984497, "sampling/importance_sampling_ratio/mean": 0.9999482989311218, "sampling/importance_sampling_ratio/min": 0.26732968389987943, "sampling/sampling_logp_difference/max": 1.3939155101776124, "sampling/sampling_logp_difference/mean": 0.016717956587672233, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2268.8, "completions/max_terminated_length": 2231.6, "completions/mean_length": 1383.540625, "completions/mean_terminated_length": 1367.305126953125, "completions/min_length": 839.8, "completions/min_terminated_length": 839.8, "entropy": 0.4027079105377197, "epoch": 0.37602820211515864, "frac_reward_zero_std": 0.05, "grad_norm": 0.903359591960907, "learning_rate": 4.768597043857523e-07, "loss": -0.0064, "num_tokens": 44430635.0, "reward": 0.45427083373069765, "reward_std": 0.19985013008117675, "rewards/e2e_recall_precision_mixed_reward/mean": 0.45427083373069765, "rewards/e2e_recall_precision_mixed_reward/std": 0.26199466586112974, "sampling/importance_sampling_ratio/max": 1.9889472484588624, "sampling/importance_sampling_ratio/mean": 1.0000965476036072, "sampling/importance_sampling_ratio/min": 0.2858219683170319, "sampling/sampling_logp_difference/max": 1.5289599657058717, "sampling/sampling_logp_difference/mean": 0.016677992790937422, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 2299.2, "completions/max_terminated_length": 2297.4, "completions/mean_length": 1311.9625, "completions/mean_terminated_length": 1308.5442626953125, "completions/min_length": 867.2, "completions/min_terminated_length": 867.2, "entropy": 0.411800742149353, "epoch": 0.381903642773208, "frac_reward_zero_std": 0.1, "grad_norm": 0.8514150381088257, "learning_rate": 4.7625393748485583e-07, "loss": 0.0109, "num_tokens": 45151851.0, "reward": 0.48619791865348816, "reward_std": 0.19733970016241073, "rewards/e2e_recall_precision_mixed_reward/mean": 0.48619791865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.310748428106308, "sampling/importance_sampling_ratio/max": 1.9813976764678956, "sampling/importance_sampling_ratio/mean": 1.000037384033203, "sampling/importance_sampling_ratio/min": 0.2959585070610046, "sampling/sampling_logp_difference/max": 1.2915854692459106, "sampling/sampling_logp_difference/mean": 0.017146169394254684, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 2122.4, "completions/max_terminated_length": 2083.8, "completions/mean_length": 1377.2875, "completions/mean_terminated_length": 1365.130322265625, "completions/min_length": 872.2, "completions/min_terminated_length": 872.2, "entropy": 0.39033161997795107, "epoch": 0.38777908343125733, "frac_reward_zero_std": 0.15, "grad_norm": 0.7350701093673706, "learning_rate": 4.7564817058395926e-07, "loss": -0.009, "num_tokens": 45916567.0, "reward": 0.39609376192092893, "reward_std": 0.19951523691415787, "rewards/e2e_recall_precision_mixed_reward/mean": 0.39609376192092893, "rewards/e2e_recall_precision_mixed_reward/std": 0.3023851901292801, "sampling/importance_sampling_ratio/max": 1.9538982629776, "sampling/importance_sampling_ratio/mean": 0.9999711871147156, "sampling/importance_sampling_ratio/min": 0.15197787082288414, "sampling/sampling_logp_difference/max": 2.7049013137817384, "sampling/sampling_logp_difference/mean": 0.01634741071611643, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 2365.0, "completions/max_terminated_length": 2233.2, "completions/mean_length": 1297.45, "completions/mean_terminated_length": 1281.5479736328125, "completions/min_length": 830.0, "completions/min_terminated_length": 830.0, "entropy": 0.40079636573791505, "epoch": 0.3936545240893067, "frac_reward_zero_std": 0.3, "grad_norm": 0.6987754702568054, "learning_rate": 4.7504240368306275e-07, "loss": 0.0049, "num_tokens": 46645495.0, "reward": 0.49000000655651094, "reward_std": 0.14566230401396751, "rewards/e2e_recall_precision_mixed_reward/mean": 0.49000000953674316, "rewards/e2e_recall_precision_mixed_reward/std": 0.23847460746765137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000020718574524, "sampling/importance_sampling_ratio/min": 0.24331406950950624, "sampling/sampling_logp_difference/max": 1.6468490839004517, "sampling/sampling_logp_difference/mean": 0.01682475283741951, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2367.6, "completions/max_terminated_length": 2156.2, "completions/mean_length": 1326.51875, "completions/mean_terminated_length": 1309.7556640625, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "entropy": 0.3823336660861969, "epoch": 0.399529964747356, "frac_reward_zero_std": 0.1, "grad_norm": 0.6209218502044678, "learning_rate": 4.7443663678216624e-07, "loss": 0.01, "num_tokens": 47367257.0, "reward": 0.5150520920753479, "reward_std": 0.22551013827323912, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5150520920753479, "rewards/e2e_recall_precision_mixed_reward/std": 0.3198787569999695, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999925410747528, "sampling/importance_sampling_ratio/min": 0.36856586337089536, "sampling/sampling_logp_difference/max": 1.3121527194976808, "sampling/sampling_logp_difference/mean": 0.01604050975292921, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1965.8, "completions/max_terminated_length": 1965.8, "completions/mean_length": 1185.409375, "completions/mean_terminated_length": 1185.409375, "completions/min_length": 730.8, "completions/min_terminated_length": 730.8, "entropy": 0.38586640954017637, "epoch": 0.40540540540540543, "frac_reward_zero_std": 0.05, "grad_norm": 1.4014785289764404, "learning_rate": 4.738308698812697e-07, "loss": 0.0466, "num_tokens": 48037180.0, "reward": 0.5460416734218597, "reward_std": 0.19236631989479064, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5460416972637177, "rewards/e2e_recall_precision_mixed_reward/std": 0.292988184094429, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999979043006897, "sampling/importance_sampling_ratio/min": 0.29654269516468046, "sampling/sampling_logp_difference/max": 1.231632113456726, "sampling/sampling_logp_difference/mean": 0.016064048185944556, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375, "completions/max_length": 2091.2, "completions/max_terminated_length": 2082.6, "completions/mean_length": 1254.175, "completions/mean_terminated_length": 1243.588623046875, "completions/min_length": 818.2, "completions/min_terminated_length": 818.2, "entropy": 0.3805388808250427, "epoch": 0.4112808460634548, "frac_reward_zero_std": 0.15, "grad_norm": 0.7526155710220337, "learning_rate": 4.7322510298037317e-07, "loss": 0.0057, "num_tokens": 48776344.0, "reward": 0.4618229269981384, "reward_std": 0.15769713521003723, "rewards/e2e_recall_precision_mixed_reward/mean": 0.4618229269981384, "rewards/e2e_recall_precision_mixed_reward/std": 0.27449490427970885, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000930905342102, "sampling/importance_sampling_ratio/min": 0.2510767489671707, "sampling/sampling_logp_difference/max": 2.2086389303207397, "sampling/sampling_logp_difference/mean": 0.015983594954013823, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 2081.6, "completions/max_terminated_length": 2033.8, "completions/mean_length": 1256.4, "completions/mean_terminated_length": 1252.8534912109376, "completions/min_length": 803.2, "completions/min_terminated_length": 803.2, "entropy": 0.36754541397094725, "epoch": 0.4171562867215041, "frac_reward_zero_std": 0.1, "grad_norm": 0.5367698669433594, "learning_rate": 4.7261933607947655e-07, "loss": -0.013, "num_tokens": 49508964.0, "reward": 0.5018229305744171, "reward_std": 0.1739724576473236, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5018229305744171, "rewards/e2e_recall_precision_mixed_reward/std": 0.2954780399799347, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998829245567322, "sampling/importance_sampling_ratio/min": 0.27626035958528516, "sampling/sampling_logp_difference/max": 1.4486989021301269, "sampling/sampling_logp_difference/mean": 0.015474473871290684, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 2188.4, "completions/max_terminated_length": 2184.4, "completions/mean_length": 1234.028125, "completions/mean_terminated_length": 1226.1088623046876, "completions/min_length": 753.4, "completions/min_terminated_length": 753.4, "entropy": 0.38073245286941526, "epoch": 0.42303172737955347, "frac_reward_zero_std": 0.25, "grad_norm": 0.7556249499320984, "learning_rate": 4.7201356917858004e-07, "loss": -0.0105, "num_tokens": 50218789.0, "reward": 0.42156251668930056, "reward_std": 0.1456248864531517, "rewards/e2e_recall_precision_mixed_reward/mean": 0.421562522649765, "rewards/e2e_recall_precision_mixed_reward/std": 0.23054299950599672, "sampling/importance_sampling_ratio/max": 1.9770933151245118, "sampling/importance_sampling_ratio/mean": 1.0000459671020507, "sampling/importance_sampling_ratio/min": 0.3449200510978699, "sampling/sampling_logp_difference/max": 1.3223530054092407, "sampling/sampling_logp_difference/mean": 0.0159611064940691, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1993.4, "completions/max_terminated_length": 1993.4, "completions/mean_length": 1205.571875, "completions/mean_terminated_length": 1205.571875, "completions/min_length": 794.4, "completions/min_terminated_length": 794.4, "entropy": 0.390489786863327, "epoch": 0.4289071680376028, "frac_reward_zero_std": 0.05, "grad_norm": 0.9579271078109741, "learning_rate": 4.7140780227768353e-07, "loss": 0.0223, "num_tokens": 50928908.0, "reward": 0.537968760728836, "reward_std": 0.21120007038116456, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5379687666893005, "rewards/e2e_recall_precision_mixed_reward/std": 0.2601155489683151, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999605894088746, "sampling/importance_sampling_ratio/min": 0.23600251823663712, "sampling/sampling_logp_difference/max": 1.7278624057769776, "sampling/sampling_logp_difference/mean": 0.016414126753807066, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1708.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 1030.24375, "completions/mean_terminated_length": 1030.24375, "completions/min_length": 696.4, "completions/min_terminated_length": 696.4, "entropy": 0.3575406074523926, "epoch": 0.43478260869565216, "frac_reward_zero_std": 0.2, "grad_norm": 0.7329716086387634, "learning_rate": 4.7080203537678697e-07, "loss": 0.0058, "num_tokens": 51567930.0, "reward": 0.5391145884990692, "reward_std": 0.19752640128135682, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5391145884990692, "rewards/e2e_recall_precision_mixed_reward/std": 0.27296304106712344, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999308943748474, "sampling/importance_sampling_ratio/min": 0.2641163617372513, "sampling/sampling_logp_difference/max": 1.4609861969947815, "sampling/sampling_logp_difference/mean": 0.015179168432950974, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1568.4, "completions/max_terminated_length": 1568.4, "completions/mean_length": 920.165625, "completions/mean_terminated_length": 920.165625, "completions/min_length": 506.6, "completions/min_terminated_length": 506.6, "entropy": 0.39158068895339965, "epoch": 0.4406580493537015, "frac_reward_zero_std": 0.15, "grad_norm": 0.8551925420761108, "learning_rate": 4.7019626847589046e-07, "loss": 0.0294, "num_tokens": 52161087.0, "reward": 0.5165625095367432, "reward_std": 0.19143509566783906, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5165625095367432, "rewards/e2e_recall_precision_mixed_reward/std": 0.3149138242006302, "sampling/importance_sampling_ratio/max": 1.9255353212356567, "sampling/importance_sampling_ratio/mean": 1.0001346826553346, "sampling/importance_sampling_ratio/min": 0.2629414364695549, "sampling/sampling_logp_difference/max": 1.4847426176071168, "sampling/sampling_logp_difference/mean": 0.016572076827287674, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1716.4, "completions/max_terminated_length": 1716.4, "completions/mean_length": 999.54375, "completions/mean_terminated_length": 999.54375, "completions/min_length": 445.2, "completions/min_terminated_length": 445.2, "entropy": 0.3913616418838501, "epoch": 0.44653349001175086, "frac_reward_zero_std": 0.1, "grad_norm": 0.8592411279678345, "learning_rate": 4.695905015749939e-07, "loss": 0.021, "num_tokens": 52813725.0, "reward": 0.5802604496479035, "reward_std": 0.22083891928195953, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5802604377269744, "rewards/e2e_recall_precision_mixed_reward/std": 0.3036410689353943, "sampling/importance_sampling_ratio/max": 1.9878347158432006, "sampling/importance_sampling_ratio/mean": 0.9998912930488586, "sampling/importance_sampling_ratio/min": 0.2878791332244873, "sampling/sampling_logp_difference/max": 1.5880217552185059, "sampling/sampling_logp_difference/mean": 0.01670288797467947, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1729.8, "completions/max_terminated_length": 1729.8, "completions/mean_length": 996.284375, "completions/mean_terminated_length": 996.284375, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "entropy": 0.3791182518005371, "epoch": 0.45240893066980026, "frac_reward_zero_std": 0.1, "grad_norm": 1.009487271308899, "learning_rate": 4.689847346740974e-07, "loss": 0.0119, "num_tokens": 53446856.0, "reward": 0.567864590883255, "reward_std": 0.2009609282016754, "rewards/e2e_recall_precision_mixed_reward/mean": 0.567864590883255, "rewards/e2e_recall_precision_mixed_reward/std": 0.331317538022995, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000531435012818, "sampling/importance_sampling_ratio/min": 0.36831892728805543, "sampling/sampling_logp_difference/max": 1.1540207147598267, "sampling/sampling_logp_difference/mean": 0.01622724235057831, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1572.2, "completions/max_terminated_length": 1572.2, "completions/mean_length": 987.134375, "completions/mean_terminated_length": 987.134375, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "entropy": 0.3846351742744446, "epoch": 0.4582843713278496, "frac_reward_zero_std": 0.1, "grad_norm": 1.2462704181671143, "learning_rate": 4.6837896777320087e-07, "loss": -0.0292, "num_tokens": 54089075.0, "reward": 0.5090625166893006, "reward_std": 0.2253621369600296, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5090625166893006, "rewards/e2e_recall_precision_mixed_reward/std": 0.32490702867507937, "sampling/importance_sampling_ratio/max": 1.8669166564941406, "sampling/importance_sampling_ratio/mean": 1.0000983238220216, "sampling/importance_sampling_ratio/min": 0.2756506517529488, "sampling/sampling_logp_difference/max": 1.474560058116913, "sampling/sampling_logp_difference/mean": 0.016727247275412083, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1904.6, "completions/max_terminated_length": 1772.6, "completions/mean_length": 1097.10625, "completions/mean_terminated_length": 1092.751806640625, "completions/min_length": 723.4, "completions/min_terminated_length": 723.4, "entropy": 0.37217140197753906, "epoch": 0.46415981198589895, "frac_reward_zero_std": 0.15, "grad_norm": 0.9669972062110901, "learning_rate": 4.677732008723043e-07, "loss": -0.0141, "num_tokens": 54781809.0, "reward": 0.4870833456516266, "reward_std": 0.15707829147577285, "rewards/e2e_recall_precision_mixed_reward/mean": 0.4870833456516266, "rewards/e2e_recall_precision_mixed_reward/std": 0.30188499987125395, "sampling/importance_sampling_ratio/max": 1.9380834102630615, "sampling/importance_sampling_ratio/mean": 0.9999426603317261, "sampling/importance_sampling_ratio/min": 0.24590765461325645, "sampling/sampling_logp_difference/max": 1.9788538694381714, "sampling/sampling_logp_difference/mean": 0.01591875497251749, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1558.6, "completions/max_terminated_length": 1558.6, "completions/mean_length": 1044.515625, "completions/mean_terminated_length": 1044.515625, "completions/min_length": 680.8, "completions/min_terminated_length": 680.8, "entropy": 0.3540132224559784, "epoch": 0.4700352526439483, "frac_reward_zero_std": 0.2, "grad_norm": 0.9251359105110168, "learning_rate": 4.671674339714078e-07, "loss": -0.0023, "num_tokens": 55438886.0, "reward": 0.5378125071525574, "reward_std": 0.18125876784324646, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5378125071525574, "rewards/e2e_recall_precision_mixed_reward/std": 0.30963089168071745, "sampling/importance_sampling_ratio/max": 1.9805966854095458, "sampling/importance_sampling_ratio/mean": 0.9999330878257752, "sampling/importance_sampling_ratio/min": 0.2766235023736954, "sampling/sampling_logp_difference/max": 1.4273133754730225, "sampling/sampling_logp_difference/mean": 0.015250829048454762, "step": 400 }, { "epoch": 0.4700352526439483, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1663.24, "eval_completions/max_terminated_length": 1663.24, "eval_completions/mean_length": 1068.386875, "eval_completions/mean_terminated_length": 1068.386875, "eval_completions/min_length": 682.2, "eval_completions/min_terminated_length": 682.2, "eval_entropy": 0.37045517563819885, "eval_frac_reward_zero_std": 0.18, "eval_loss": 0.010637059807777405, "eval_num_tokens": 55438886.0, "eval_reward": 0.48254167914390567, "eval_reward_std": 0.171167514026165, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.4825416815280914, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.29884088337421416, "eval_runtime": 435.7442, "eval_samples_per_second": 0.229, "eval_sampling/importance_sampling_ratio/max": 1.9646133232116698, "eval_sampling/importance_sampling_ratio/mean": 0.999952335357666, "eval_sampling/importance_sampling_ratio/min": 0.3115068358182907, "eval_sampling/sampling_logp_difference/max": 1.2885041189193727, "eval_sampling/sampling_logp_difference/mean": 0.015734767876565456, "eval_steps_per_second": 0.005, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1792.6, "completions/max_terminated_length": 1771.2, "completions/mean_length": 1133.459375, "completions/mean_terminated_length": 1129.54833984375, "completions/min_length": 769.6, "completions/min_terminated_length": 769.6, "entropy": 0.361786413192749, "epoch": 0.47591069330199764, "frac_reward_zero_std": 0.25, "grad_norm": 0.6992851495742798, "learning_rate": 4.6656166707051123e-07, "loss": 0.0071, "num_tokens": 56134341.0, "reward": 0.5578646063804626, "reward_std": 0.1855187177658081, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5578646123409271, "rewards/e2e_recall_precision_mixed_reward/std": 0.28627926409244536, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000202178955078, "sampling/importance_sampling_ratio/min": 0.3811205953359604, "sampling/sampling_logp_difference/max": 1.274878692626953, "sampling/sampling_logp_difference/mean": 0.015369786508381366, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1997.2, "completions/max_terminated_length": 1997.2, "completions/mean_length": 1159.346875, "completions/mean_terminated_length": 1159.346875, "completions/min_length": 732.4, "completions/min_terminated_length": 732.4, "entropy": 0.3553011953830719, "epoch": 0.481786133960047, "frac_reward_zero_std": 0.2, "grad_norm": 1.205419898033142, "learning_rate": 4.659559001696147e-07, "loss": 0.0217, "num_tokens": 56814612.0, "reward": 0.5170312762260437, "reward_std": 0.17431987226009368, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5170312762260437, "rewards/e2e_recall_precision_mixed_reward/std": 0.2719025552272797, "sampling/importance_sampling_ratio/max": 1.979778790473938, "sampling/importance_sampling_ratio/mean": 0.9998809933662415, "sampling/importance_sampling_ratio/min": 0.3756587505340576, "sampling/sampling_logp_difference/max": 1.0862587213516235, "sampling/sampling_logp_difference/mean": 0.014951322041451932, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1706.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 1093.56875, "completions/mean_terminated_length": 1093.56875, "completions/min_length": 690.4, "completions/min_terminated_length": 690.4, "entropy": 0.36290356516838074, "epoch": 0.48766157461809634, "frac_reward_zero_std": 0.2, "grad_norm": 1.1141079664230347, "learning_rate": 4.653501332687182e-07, "loss": 0.0016, "num_tokens": 57490778.0, "reward": 0.47807292342185975, "reward_std": 0.16014991998672484, "rewards/e2e_recall_precision_mixed_reward/mean": 0.47807292342185975, "rewards/e2e_recall_precision_mixed_reward/std": 0.29495506882667544, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000198245048524, "sampling/importance_sampling_ratio/min": 0.29137383252382276, "sampling/sampling_logp_difference/max": 1.4087148904800415, "sampling/sampling_logp_difference/mean": 0.015364770777523518, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1689.2, "completions/max_terminated_length": 1689.2, "completions/mean_length": 1061.3875, "completions/mean_terminated_length": 1061.3875, "completions/min_length": 670.6, "completions/min_terminated_length": 670.6, "entropy": 0.3800378322601318, "epoch": 0.4935370152761457, "frac_reward_zero_std": 0.15, "grad_norm": 0.8299875855445862, "learning_rate": 4.6474436636782165e-07, "loss": 0.002, "num_tokens": 58153094.0, "reward": 0.5261979281902314, "reward_std": 0.15056960582733153, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5261979281902314, "rewards/e2e_recall_precision_mixed_reward/std": 0.28166911005973816, "sampling/importance_sampling_ratio/max": 1.9816663026809693, "sampling/importance_sampling_ratio/mean": 1.0000874996185303, "sampling/importance_sampling_ratio/min": 0.35656355023384095, "sampling/sampling_logp_difference/max": 1.1156534433364869, "sampling/sampling_logp_difference/mean": 0.015891117975115776, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1809.4, "completions/max_terminated_length": 1809.4, "completions/mean_length": 1166.70625, "completions/mean_terminated_length": 1166.70625, "completions/min_length": 792.6, "completions/min_terminated_length": 792.6, "entropy": 0.3703398644924164, "epoch": 0.4994124559341951, "frac_reward_zero_std": 0.45, "grad_norm": 0.7483147978782654, "learning_rate": 4.6413859946692514e-07, "loss": 0.0159, "num_tokens": 58863832.0, "reward": 0.4547916650772095, "reward_std": 0.09739691466093063, "rewards/e2e_recall_precision_mixed_reward/mean": 0.45479167699813844, "rewards/e2e_recall_precision_mixed_reward/std": 0.23734066933393477, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999337792396545, "sampling/importance_sampling_ratio/min": 0.31920480728149414, "sampling/sampling_logp_difference/max": 1.3080387592315674, "sampling/sampling_logp_difference/mean": 0.015443827025592327, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375, "completions/max_length": 1885.2, "completions/max_terminated_length": 1781.4, "completions/mean_length": 1171.234375, "completions/mean_terminated_length": 1158.0830810546875, "completions/min_length": 709.6, "completions/min_terminated_length": 709.6, "entropy": 0.36316835284233095, "epoch": 0.5052878965922444, "frac_reward_zero_std": 0.1, "grad_norm": 0.9235474467277527, "learning_rate": 4.635328325660286e-07, "loss": -0.0112, "num_tokens": 59545799.0, "reward": 0.5247916758060456, "reward_std": 0.17066848278045654, "rewards/e2e_recall_precision_mixed_reward/mean": 0.52479168176651, "rewards/e2e_recall_precision_mixed_reward/std": 0.2839739441871643, "sampling/importance_sampling_ratio/max": 1.9724763870239257, "sampling/importance_sampling_ratio/mean": 0.999966812133789, "sampling/importance_sampling_ratio/min": 0.2238582156598568, "sampling/sampling_logp_difference/max": 1.789958667755127, "sampling/sampling_logp_difference/mean": 0.01525730974972248, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1808.8, "completions/max_terminated_length": 1808.8, "completions/mean_length": 1119.996875, "completions/mean_terminated_length": 1119.996875, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "entropy": 0.3681009292602539, "epoch": 0.5111633372502937, "frac_reward_zero_std": 0.05, "grad_norm": 0.8923326730728149, "learning_rate": 4.62927065665132e-07, "loss": 0.01, "num_tokens": 60215142.0, "reward": 0.521875, "reward_std": 0.20172743797302245, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5218750059604644, "rewards/e2e_recall_precision_mixed_reward/std": 0.31086271703243257, "sampling/importance_sampling_ratio/max": 1.9483107328414917, "sampling/importance_sampling_ratio/mean": 0.9998976588249207, "sampling/importance_sampling_ratio/min": 0.3355667650699615, "sampling/sampling_logp_difference/max": 1.200546884536743, "sampling/sampling_logp_difference/mean": 0.01538294218480587, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1521.8, "completions/max_terminated_length": 1521.8, "completions/mean_length": 1014.690625, "completions/mean_terminated_length": 1014.690625, "completions/min_length": 678.4, "completions/min_terminated_length": 678.4, "entropy": 0.374547415971756, "epoch": 0.5170387779083431, "frac_reward_zero_std": 0.15, "grad_norm": 0.7466691136360168, "learning_rate": 4.623212987642355e-07, "loss": -0.0055, "num_tokens": 60849251.0, "reward": 0.627343761920929, "reward_std": 0.18858850598335267, "rewards/e2e_recall_precision_mixed_reward/mean": 0.627343761920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.30483335852622984, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000047564506531, "sampling/importance_sampling_ratio/min": 0.30364882200956345, "sampling/sampling_logp_difference/max": 1.9110549449920655, "sampling/sampling_logp_difference/mean": 0.015604251064360142, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1848.4, "completions/max_terminated_length": 1848.4, "completions/mean_length": 1151.296875, "completions/mean_terminated_length": 1151.296875, "completions/min_length": 780.2, "completions/min_terminated_length": 780.2, "entropy": 0.3709349751472473, "epoch": 0.5229142185663925, "frac_reward_zero_std": 0.25, "grad_norm": 0.8087165355682373, "learning_rate": 4.6171553186333894e-07, "loss": 0.0058, "num_tokens": 61540146.0, "reward": 0.5669270932674408, "reward_std": 0.17213488817214967, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5669270932674408, "rewards/e2e_recall_precision_mixed_reward/std": 0.270868119597435, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000919818878173, "sampling/importance_sampling_ratio/min": 0.3365716278553009, "sampling/sampling_logp_difference/max": 1.2326734781265258, "sampling/sampling_logp_difference/mean": 0.015531861409544944, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1881.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 1190.221875, "completions/mean_terminated_length": 1190.221875, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "entropy": 0.3719481110572815, "epoch": 0.5287896592244419, "frac_reward_zero_std": 0.1, "grad_norm": 0.7721225619316101, "learning_rate": 4.611097649624424e-07, "loss": 0.0083, "num_tokens": 62243545.0, "reward": 0.4819791793823242, "reward_std": 0.2040602147579193, "rewards/e2e_recall_precision_mixed_reward/mean": 0.4819791793823242, "rewards/e2e_recall_precision_mixed_reward/std": 0.27713783383369445, "sampling/importance_sampling_ratio/max": 1.9724904775619507, "sampling/importance_sampling_ratio/mean": 1.000051498413086, "sampling/importance_sampling_ratio/min": 0.29547479525208475, "sampling/sampling_logp_difference/max": 1.5890276193618775, "sampling/sampling_logp_difference/mean": 0.015463878214359284, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1946.0, "completions/max_terminated_length": 1804.2, "completions/mean_length": 1184.165625, "completions/mean_terminated_length": 1180.2292724609374, "completions/min_length": 746.6, "completions/min_terminated_length": 746.6, "entropy": 0.36654204726219175, "epoch": 0.5346650998824912, "frac_reward_zero_std": 0.15, "grad_norm": 0.6808129549026489, "learning_rate": 4.6050399806154586e-07, "loss": 0.0152, "num_tokens": 62939866.0, "reward": 0.46161459684371947, "reward_std": 0.21264611780643464, "rewards/e2e_recall_precision_mixed_reward/mean": 0.46161459684371947, "rewards/e2e_recall_precision_mixed_reward/std": 0.30921355485916135, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999315500259399, "sampling/importance_sampling_ratio/min": 0.19656258896866347, "sampling/sampling_logp_difference/max": 2.8585072994232177, "sampling/sampling_logp_difference/mean": 0.015469144657254219, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1971.4, "completions/max_terminated_length": 1971.4, "completions/mean_length": 1279.0875, "completions/mean_terminated_length": 1279.0875, "completions/min_length": 807.0, "completions/min_terminated_length": 807.0, "entropy": 0.3922215819358826, "epoch": 0.5405405405405406, "frac_reward_zero_std": 0.3, "grad_norm": 0.7750838398933411, "learning_rate": 4.5989823116064935e-07, "loss": 0.0255, "num_tokens": 63669686.0, "reward": 0.5097916722297668, "reward_std": 0.15973457396030427, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5097916722297668, "rewards/e2e_recall_precision_mixed_reward/std": 0.2736150071024895, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999813556671142, "sampling/importance_sampling_ratio/min": 0.093977015838027, "sampling/sampling_logp_difference/max": 2.793697214126587, "sampling/sampling_logp_difference/mean": 0.015985237061977388, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1971.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 1246.646875, "completions/mean_terminated_length": 1246.646875, "completions/min_length": 780.6, "completions/min_terminated_length": 780.6, "entropy": 0.38005300164222716, "epoch": 0.5464159811985899, "frac_reward_zero_std": 0.25, "grad_norm": 0.7596340775489807, "learning_rate": 4.5929246425975284e-07, "loss": 0.0204, "num_tokens": 64388949.0, "reward": 0.5148958444595337, "reward_std": 0.17929587066173552, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5148958444595337, "rewards/e2e_recall_precision_mixed_reward/std": 0.2609217405319214, "sampling/importance_sampling_ratio/max": 1.9646515369415283, "sampling/importance_sampling_ratio/mean": 1.0001105666160583, "sampling/importance_sampling_ratio/min": 0.3292378157377243, "sampling/sampling_logp_difference/max": 1.3221782445907593, "sampling/sampling_logp_difference/mean": 0.015639954805374147, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1978.4, "completions/max_terminated_length": 1978.4, "completions/mean_length": 1199.6125, "completions/mean_terminated_length": 1199.6125, "completions/min_length": 725.4, "completions/min_terminated_length": 725.4, "entropy": 0.3578630328178406, "epoch": 0.5522914218566393, "frac_reward_zero_std": 0.25, "grad_norm": 0.5981117486953735, "learning_rate": 4.586866973588563e-07, "loss": 0.0069, "num_tokens": 65093913.0, "reward": 0.6246875107288361, "reward_std": 0.18131420761346817, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6246875107288361, "rewards/e2e_recall_precision_mixed_reward/std": 0.28334271609783174, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999997079372406, "sampling/importance_sampling_ratio/min": 0.29355984926223755, "sampling/sampling_logp_difference/max": 1.2378879070281983, "sampling/sampling_logp_difference/mean": 0.015142908878624439, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1809.0, "completions/max_terminated_length": 1809.0, "completions/mean_length": 1175.615625, "completions/mean_terminated_length": 1175.615625, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "entropy": 0.3991717994213104, "epoch": 0.5581668625146886, "frac_reward_zero_std": 0.05, "grad_norm": 0.9285194873809814, "learning_rate": 4.5808093045795977e-07, "loss": 0.0292, "num_tokens": 65823646.0, "reward": 0.5126562476158142, "reward_std": 0.21755965948104858, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5126562535762786, "rewards/e2e_recall_precision_mixed_reward/std": 0.29003084897994996, "sampling/importance_sampling_ratio/max": 1.9938787698745728, "sampling/importance_sampling_ratio/mean": 0.9999786853790283, "sampling/importance_sampling_ratio/min": 0.318202418088913, "sampling/sampling_logp_difference/max": 1.212429451942444, "sampling/sampling_logp_difference/mean": 0.01659090295433998, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 2030.6, "completions/max_terminated_length": 1936.0, "completions/mean_length": 1207.65, "completions/mean_terminated_length": 1204.127392578125, "completions/min_length": 737.6, "completions/min_terminated_length": 737.6, "entropy": 0.3959426462650299, "epoch": 0.564042303172738, "frac_reward_zero_std": 0.15, "grad_norm": 0.5899296402931213, "learning_rate": 4.574751635570632e-07, "loss": 0.0117, "num_tokens": 66513274.0, "reward": 0.5737500190734863, "reward_std": 0.19729947447776794, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5737500250339508, "rewards/e2e_recall_precision_mixed_reward/std": 0.287484747171402, "sampling/importance_sampling_ratio/max": 1.983570671081543, "sampling/importance_sampling_ratio/mean": 0.9999786853790283, "sampling/importance_sampling_ratio/min": 0.32921838760375977, "sampling/sampling_logp_difference/max": 1.1545652389526366, "sampling/sampling_logp_difference/mean": 0.016338953003287315, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 2066.4, "completions/max_terminated_length": 1972.4, "completions/mean_length": 1254.078125, "completions/mean_terminated_length": 1250.579541015625, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "entropy": 0.4095371961593628, "epoch": 0.5699177438307873, "frac_reward_zero_std": 0.1, "grad_norm": 0.8716078996658325, "learning_rate": 4.568693966561667e-07, "loss": -0.0075, "num_tokens": 67238959.0, "reward": 0.5467187583446502, "reward_std": 0.21395463943481446, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5467187702655792, "rewards/e2e_recall_precision_mixed_reward/std": 0.306264591217041, "sampling/importance_sampling_ratio/max": 1.9498302221298218, "sampling/importance_sampling_ratio/mean": 1.0000301122665405, "sampling/importance_sampling_ratio/min": 0.2736341401934624, "sampling/sampling_logp_difference/max": 1.681512475013733, "sampling/sampling_logp_difference/mean": 0.016801463067531587, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 2096.6, "completions/max_terminated_length": 2082.2, "completions/mean_length": 1255.821875, "completions/mean_terminated_length": 1252.06220703125, "completions/min_length": 655.2, "completions/min_terminated_length": 655.2, "entropy": 0.40909390449523925, "epoch": 0.5757931844888367, "frac_reward_zero_std": 0.25, "grad_norm": 0.6042861342430115, "learning_rate": 4.562636297552702e-07, "loss": 0.0145, "num_tokens": 67953666.0, "reward": 0.5453646063804627, "reward_std": 0.17147594094276428, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5453646063804627, "rewards/e2e_recall_precision_mixed_reward/std": 0.25718378126621244, "sampling/importance_sampling_ratio/max": 1.90937283039093, "sampling/importance_sampling_ratio/mean": 1.000064730644226, "sampling/importance_sampling_ratio/min": 0.350798898935318, "sampling/sampling_logp_difference/max": 1.1801783084869384, "sampling/sampling_logp_difference/mean": 0.016766057163476945, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2113.2, "completions/max_terminated_length": 2105.4, "completions/mean_length": 1227.88125, "completions/mean_terminated_length": 1209.488232421875, "completions/min_length": 691.8, "completions/min_terminated_length": 691.8, "entropy": 0.4179535210132599, "epoch": 0.581668625146886, "frac_reward_zero_std": 0.2, "grad_norm": 0.7605124115943909, "learning_rate": 4.556578628543736e-07, "loss": 0.0052, "num_tokens": 68652744.0, "reward": 0.565625011920929, "reward_std": 0.1908968836069107, "rewards/e2e_recall_precision_mixed_reward/mean": 0.565625011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.28528952896595, "sampling/importance_sampling_ratio/max": 1.9585810661315919, "sampling/importance_sampling_ratio/mean": 1.0000707626342773, "sampling/importance_sampling_ratio/min": 0.354918098449707, "sampling/sampling_logp_difference/max": 1.057307195663452, "sampling/sampling_logp_difference/mean": 0.017101043835282326, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1780.4, "completions/max_terminated_length": 1780.4, "completions/mean_length": 1157.896875, "completions/mean_terminated_length": 1157.896875, "completions/min_length": 676.2, "completions/min_terminated_length": 676.2, "entropy": 0.4073349952697754, "epoch": 0.5875440658049353, "frac_reward_zero_std": 0.1, "grad_norm": 0.7811326384544373, "learning_rate": 4.550520959534771e-07, "loss": 0.0204, "num_tokens": 69374967.0, "reward": 0.5698958396911621, "reward_std": 0.1886465221643448, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5698958396911621, "rewards/e2e_recall_precision_mixed_reward/std": 0.3525637865066528, "sampling/importance_sampling_ratio/max": 1.851768970489502, "sampling/importance_sampling_ratio/mean": 1.0000535249710083, "sampling/importance_sampling_ratio/min": 0.42424078583717345, "sampling/sampling_logp_difference/max": 1.0173060417175293, "sampling/sampling_logp_difference/mean": 0.016810437291860582, "step": 500 }, { "epoch": 0.5875440658049353, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.000625, "eval_completions/max_length": 1826.52, "eval_completions/max_terminated_length": 1794.64, "eval_completions/mean_length": 1125.531875, "eval_completions/mean_terminated_length": 1124.5743969726564, "eval_completions/min_length": 649.08, "eval_completions/min_terminated_length": 649.08, "eval_entropy": 0.3993370485305786, "eval_frac_reward_zero_std": 0.2, "eval_loss": -0.0014770731795579195, "eval_num_tokens": 69374967.0, "eval_reward": 0.5213125109672546, "eval_reward_std": 0.19255633950233458, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.5213125121593475, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3224320185184479, "eval_runtime": 482.9379, "eval_samples_per_second": 0.207, "eval_sampling/importance_sampling_ratio/max": 1.9666599893569947, "eval_sampling/importance_sampling_ratio/mean": 0.999978015422821, "eval_sampling/importance_sampling_ratio/min": 0.32408329740166664, "eval_sampling/sampling_logp_difference/max": 1.3359552669525145, "eval_sampling/sampling_logp_difference/mean": 0.01666470244526863, "eval_steps_per_second": 0.004, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2121.8, "completions/max_terminated_length": 2065.2, "completions/mean_length": 1188.903125, "completions/mean_terminated_length": 1169.357568359375, "completions/min_length": 606.6, "completions/min_terminated_length": 606.6, "entropy": 0.41026775240898133, "epoch": 0.5934195064629847, "frac_reward_zero_std": 0.1, "grad_norm": 0.9700555205345154, "learning_rate": 4.5444632905258054e-07, "loss": -0.0078, "num_tokens": 70076500.0, "reward": 0.47963541746139526, "reward_std": 0.24116905331611632, "rewards/e2e_recall_precision_mixed_reward/mean": 0.4796354353427887, "rewards/e2e_recall_precision_mixed_reward/std": 0.3578904628753662, "sampling/importance_sampling_ratio/max": 1.9784984588623047, "sampling/importance_sampling_ratio/mean": 0.9999493837356568, "sampling/importance_sampling_ratio/min": 0.2738966698758304, "sampling/sampling_logp_difference/max": 1.9038654685020446, "sampling/sampling_logp_difference/mean": 0.017127976939082145, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375, "completions/max_length": 1973.2, "completions/max_terminated_length": 1933.2, "completions/mean_length": 1116.7625, "completions/mean_terminated_length": 1105.833837890625, "completions/min_length": 571.4, "completions/min_terminated_length": 571.4, "entropy": 0.3997494399547577, "epoch": 0.599294947121034, "frac_reward_zero_std": 0.1, "grad_norm": 0.8417585492134094, "learning_rate": 4.5384056215168403e-07, "loss": -0.0091, "num_tokens": 70751708.0, "reward": 0.5510416805744172, "reward_std": 0.24827665388584136, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5510416984558105, "rewards/e2e_recall_precision_mixed_reward/std": 0.35132635235786436, "sampling/importance_sampling_ratio/max": 1.9057066679000854, "sampling/importance_sampling_ratio/mean": 1.00001118183136, "sampling/importance_sampling_ratio/min": 0.34424024224281313, "sampling/sampling_logp_difference/max": 1.115197741985321, "sampling/sampling_logp_difference/mean": 0.016710634157061578, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 2148.0, "completions/max_terminated_length": 2118.6, "completions/mean_length": 1253.075, "completions/mean_terminated_length": 1249.4044189453125, "completions/min_length": 749.8, "completions/min_terminated_length": 749.8, "entropy": 0.39709330797195436, "epoch": 0.6051703877790834, "frac_reward_zero_std": 0.2, "grad_norm": 0.8181708455085754, "learning_rate": 4.5323479525078747e-07, "loss": -0.0217, "num_tokens": 71472448.0, "reward": 0.41421875953674314, "reward_std": 0.18146341145038605, "rewards/e2e_recall_precision_mixed_reward/mean": 0.41421875953674314, "rewards/e2e_recall_precision_mixed_reward/std": 0.2794097185134888, "sampling/importance_sampling_ratio/max": 1.890633797645569, "sampling/importance_sampling_ratio/mean": 0.9998977184295654, "sampling/importance_sampling_ratio/min": 0.361523362994194, "sampling/sampling_logp_difference/max": 1.0541168928146363, "sampling/sampling_logp_difference/mean": 0.01627396307885647, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1854.6, "completions/max_terminated_length": 1854.6, "completions/mean_length": 1165.190625, "completions/mean_terminated_length": 1165.190625, "completions/min_length": 655.6, "completions/min_terminated_length": 655.6, "entropy": 0.3984744131565094, "epoch": 0.6110458284371327, "frac_reward_zero_std": 0.2, "grad_norm": 0.802769660949707, "learning_rate": 4.526290283498909e-07, "loss": 0.0145, "num_tokens": 72225469.0, "reward": 0.48208335041999817, "reward_std": 0.20398985743522643, "rewards/e2e_recall_precision_mixed_reward/mean": 0.48208335041999817, "rewards/e2e_recall_precision_mixed_reward/std": 0.31512056589126586, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999612689018249, "sampling/importance_sampling_ratio/min": 0.31723029613494874, "sampling/sampling_logp_difference/max": 1.2719123601913451, "sampling/sampling_logp_difference/mean": 0.016700203344225884, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1694.2, "completions/max_terminated_length": 1694.2, "completions/mean_length": 1077.79375, "completions/mean_terminated_length": 1077.79375, "completions/min_length": 639.2, "completions/min_terminated_length": 639.2, "entropy": 0.390575510263443, "epoch": 0.6169212690951822, "frac_reward_zero_std": 0.25, "grad_norm": 0.8257818222045898, "learning_rate": 4.520232614489944e-07, "loss": -0.0109, "num_tokens": 72919387.0, "reward": 0.6024479389190673, "reward_std": 0.18384309411048888, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6024479389190673, "rewards/e2e_recall_precision_mixed_reward/std": 0.29759698510169985, "sampling/importance_sampling_ratio/max": 1.951656460762024, "sampling/importance_sampling_ratio/mean": 0.9999419927597046, "sampling/importance_sampling_ratio/min": 0.4299475193023682, "sampling/sampling_logp_difference/max": 0.8493948101997375, "sampling/sampling_logp_difference/mean": 0.016626672819256783, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1810.4, "completions/max_terminated_length": 1810.4, "completions/mean_length": 1170.88125, "completions/mean_terminated_length": 1170.88125, "completions/min_length": 722.6, "completions/min_terminated_length": 722.6, "entropy": 0.37268310189247134, "epoch": 0.6227967097532315, "frac_reward_zero_std": 0.1, "grad_norm": 0.9070501923561096, "learning_rate": 4.5141749454809783e-07, "loss": 0.0075, "num_tokens": 73614293.0, "reward": 0.5833333432674408, "reward_std": 0.19135434031486512, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5833333432674408, "rewards/e2e_recall_precision_mixed_reward/std": 0.2914866417646408, "sampling/importance_sampling_ratio/max": 1.9210278034210204, "sampling/importance_sampling_ratio/mean": 0.9998873472213745, "sampling/importance_sampling_ratio/min": 0.3119745343923569, "sampling/sampling_logp_difference/max": 1.2648154497146606, "sampling/sampling_logp_difference/mean": 0.015689116902649403, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1951.2, "completions/max_terminated_length": 1881.0, "completions/mean_length": 1136.20625, "completions/mean_terminated_length": 1132.3883056640625, "completions/min_length": 609.6, "completions/min_terminated_length": 609.6, "entropy": 0.37793651819229124, "epoch": 0.6286721504112809, "frac_reward_zero_std": 0.2, "grad_norm": 0.7828454971313477, "learning_rate": 4.508117276472013e-07, "loss": 0.0106, "num_tokens": 74277411.0, "reward": 0.49088543057441714, "reward_std": 0.1814019948244095, "rewards/e2e_recall_precision_mixed_reward/mean": 0.49088541865348817, "rewards/e2e_recall_precision_mixed_reward/std": 0.3006050676107407, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999652862548828, "sampling/importance_sampling_ratio/min": 0.42095847725868224, "sampling/sampling_logp_difference/max": 1.4004887104034425, "sampling/sampling_logp_difference/mean": 0.015673627704381944, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375, "completions/max_length": 1927.2, "completions/max_terminated_length": 1927.2, "completions/mean_length": 1194.4, "completions/mean_terminated_length": 1183.4655029296875, "completions/min_length": 722.4, "completions/min_terminated_length": 722.4, "entropy": 0.3972707211971283, "epoch": 0.6345475910693302, "frac_reward_zero_std": 0.2, "grad_norm": 0.7965516448020935, "learning_rate": 4.502059607463048e-07, "loss": -0.0058, "num_tokens": 74952935.0, "reward": 0.49776042699813844, "reward_std": 0.18791168928146362, "rewards/e2e_recall_precision_mixed_reward/mean": 0.49776042699813844, "rewards/e2e_recall_precision_mixed_reward/std": 0.32319254875183107, "sampling/importance_sampling_ratio/max": 1.9314549446105957, "sampling/importance_sampling_ratio/mean": 0.9998380184173584, "sampling/importance_sampling_ratio/min": 0.3149589985609055, "sampling/sampling_logp_difference/max": 1.28137925863266, "sampling/sampling_logp_difference/mean": 0.016445842757821082, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 2184.8, "completions/max_terminated_length": 2125.4, "completions/mean_length": 1200.940625, "completions/mean_terminated_length": 1196.8129150390625, "completions/min_length": 651.6, "completions/min_terminated_length": 651.6, "entropy": 0.35937982201576235, "epoch": 0.6404230317273796, "frac_reward_zero_std": 0.25, "grad_norm": 0.6078236699104309, "learning_rate": 4.4960019384540825e-07, "loss": -0.0231, "num_tokens": 75673568.0, "reward": 0.5217708289623261, "reward_std": 0.1847947582602501, "rewards/e2e_recall_precision_mixed_reward/mean": 0.521770840883255, "rewards/e2e_recall_precision_mixed_reward/std": 0.3309717208147049, "sampling/importance_sampling_ratio/max": 1.9890816926956176, "sampling/importance_sampling_ratio/mean": 0.999953544139862, "sampling/importance_sampling_ratio/min": 0.2543433949351311, "sampling/sampling_logp_difference/max": 1.4823894262313844, "sampling/sampling_logp_difference/mean": 0.015391046553850174, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 2021.8, "completions/max_terminated_length": 1978.0, "completions/mean_length": 1226.0375, "completions/mean_terminated_length": 1218.3116943359375, "completions/min_length": 760.8, "completions/min_terminated_length": 760.8, "entropy": 0.3644507646560669, "epoch": 0.6462984723854289, "frac_reward_zero_std": 0.15, "grad_norm": 0.7988547086715698, "learning_rate": 4.4899442694451174e-07, "loss": -0.01, "num_tokens": 76352900.0, "reward": 0.5784375071525574, "reward_std": 0.19074564576148986, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5784375071525574, "rewards/e2e_recall_precision_mixed_reward/std": 0.29962594509124757, "sampling/importance_sampling_ratio/max": 1.9379316568374634, "sampling/importance_sampling_ratio/mean": 1.0000091791152954, "sampling/importance_sampling_ratio/min": 0.27861389741301534, "sampling/sampling_logp_difference/max": 1.5992833375930786, "sampling/sampling_logp_difference/mean": 0.015281017497181892, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1948.0, "completions/max_terminated_length": 1948.0, "completions/mean_length": 1158.328125, "completions/mean_terminated_length": 1158.328125, "completions/min_length": 670.8, "completions/min_terminated_length": 670.8, "entropy": 0.3680810987949371, "epoch": 0.6521739130434783, "frac_reward_zero_std": 0.1, "grad_norm": 0.7596303820610046, "learning_rate": 4.4838866004361517e-07, "loss": 0.0209, "num_tokens": 77036733.0, "reward": 0.6009375095367432, "reward_std": 0.20273202657699585, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6009375214576721, "rewards/e2e_recall_precision_mixed_reward/std": 0.2976983994245529, "sampling/importance_sampling_ratio/max": 1.8425412893295288, "sampling/importance_sampling_ratio/mean": 0.9999056339263916, "sampling/importance_sampling_ratio/min": 0.365853750705719, "sampling/sampling_logp_difference/max": 1.008244252204895, "sampling/sampling_logp_difference/mean": 0.015644372254610062, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1914.8, "completions/max_terminated_length": 1914.8, "completions/mean_length": 1163.9125, "completions/mean_terminated_length": 1163.9125, "completions/min_length": 646.0, "completions/min_terminated_length": 646.0, "entropy": 0.34467315673828125, "epoch": 0.6580493537015276, "frac_reward_zero_std": 0.25, "grad_norm": 0.5723123550415039, "learning_rate": 4.4778289314271866e-07, "loss": 0.0259, "num_tokens": 77729553.0, "reward": 0.5305729269981384, "reward_std": 0.15761651396751403, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5305729269981384, "rewards/e2e_recall_precision_mixed_reward/std": 0.3086104154586792, "sampling/importance_sampling_ratio/max": 1.9935364246368408, "sampling/importance_sampling_ratio/mean": 0.9999402403831482, "sampling/importance_sampling_ratio/min": 0.3442982375621796, "sampling/sampling_logp_difference/max": 1.0778313636779786, "sampling/sampling_logp_difference/mean": 0.014830858074128627, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1942.2, "completions/max_terminated_length": 1942.2, "completions/mean_length": 1147.51875, "completions/mean_terminated_length": 1147.51875, "completions/min_length": 760.6, "completions/min_terminated_length": 760.6, "entropy": 0.3442341387271881, "epoch": 0.663924794359577, "frac_reward_zero_std": 0.25, "grad_norm": 0.6711921095848083, "learning_rate": 4.4717712624182215e-07, "loss": 0.0053, "num_tokens": 78408247.0, "reward": 0.5476041793823242, "reward_std": 0.14775162786245347, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5476041913032532, "rewards/e2e_recall_precision_mixed_reward/std": 0.29104921221733093, "sampling/importance_sampling_ratio/max": 1.9945608377456665, "sampling/importance_sampling_ratio/mean": 1.0000170111656188, "sampling/importance_sampling_ratio/min": 0.41033602952957154, "sampling/sampling_logp_difference/max": 0.9867128372192383, "sampling/sampling_logp_difference/mean": 0.014713744446635247, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1990.6, "completions/max_terminated_length": 1891.4, "completions/mean_length": 1251.853125, "completions/mean_terminated_length": 1247.9536865234375, "completions/min_length": 734.2, "completions/min_terminated_length": 734.2, "entropy": 0.36719573736190797, "epoch": 0.6698002350176263, "frac_reward_zero_std": 0.15, "grad_norm": 0.8351427912712097, "learning_rate": 4.465713593409256e-07, "loss": 0.0039, "num_tokens": 79119076.0, "reward": 0.5445312678813934, "reward_std": 0.21431024968624116, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5445312678813934, "rewards/e2e_recall_precision_mixed_reward/std": 0.29993789792060854, "sampling/importance_sampling_ratio/max": 1.9819918394088745, "sampling/importance_sampling_ratio/mean": 1.0000818252563477, "sampling/importance_sampling_ratio/min": 0.353739058971405, "sampling/sampling_logp_difference/max": 1.094305396080017, "sampling/sampling_logp_difference/mean": 0.015493137948215007, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1775.6, "completions/max_terminated_length": 1775.6, "completions/mean_length": 1123.984375, "completions/mean_terminated_length": 1123.984375, "completions/min_length": 751.6, "completions/min_terminated_length": 751.6, "entropy": 0.3667173147201538, "epoch": 0.6756756756756757, "frac_reward_zero_std": 0.25, "grad_norm": 0.6817618608474731, "learning_rate": 4.459655924400291e-07, "loss": 0.0128, "num_tokens": 79775567.0, "reward": 0.6600000143051148, "reward_std": 0.1561010330915451, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6600000143051148, "rewards/e2e_recall_precision_mixed_reward/std": 0.27142361700534823, "sampling/importance_sampling_ratio/max": 1.961619234085083, "sampling/importance_sampling_ratio/mean": 1.0000163555145263, "sampling/importance_sampling_ratio/min": 0.36026729345321656, "sampling/sampling_logp_difference/max": 1.0491438388824463, "sampling/sampling_logp_difference/mean": 0.01550084501504898, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1770.6, "completions/max_terminated_length": 1770.6, "completions/mean_length": 1138.36875, "completions/mean_terminated_length": 1138.36875, "completions/min_length": 691.6, "completions/min_terminated_length": 691.6, "entropy": 0.3702571511268616, "epoch": 0.681551116333725, "frac_reward_zero_std": 0.2, "grad_norm": 0.885202944278717, "learning_rate": 4.453598255391325e-07, "loss": 0.008, "num_tokens": 80422453.0, "reward": 0.5227083444595337, "reward_std": 0.1915093831717968, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5227083444595337, "rewards/e2e_recall_precision_mixed_reward/std": 0.2948000729084015, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999974846839905, "sampling/importance_sampling_ratio/min": 0.41094982624053955, "sampling/sampling_logp_difference/max": 1.0436588287353517, "sampling/sampling_logp_difference/mean": 0.01540203858166933, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 1851.2, "completions/max_terminated_length": 1843.6, "completions/mean_length": 1159.803125, "completions/mean_terminated_length": 1152.411279296875, "completions/min_length": 670.4, "completions/min_terminated_length": 670.4, "entropy": 0.3673887014389038, "epoch": 0.6874265569917744, "frac_reward_zero_std": 0.3, "grad_norm": 0.6437573432922363, "learning_rate": 4.44754058638236e-07, "loss": -0.006, "num_tokens": 81093870.0, "reward": 0.6208333611488343, "reward_std": 0.15290768593549728, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6208333611488343, "rewards/e2e_recall_precision_mixed_reward/std": 0.2849018633365631, "sampling/importance_sampling_ratio/max": 1.8680822134017945, "sampling/importance_sampling_ratio/mean": 1.0000381112098693, "sampling/importance_sampling_ratio/min": 0.378055739402771, "sampling/sampling_logp_difference/max": 1.003414511680603, "sampling/sampling_logp_difference/mean": 0.015409078076481819, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1771.8, "completions/max_terminated_length": 1771.8, "completions/mean_length": 1149.809375, "completions/mean_terminated_length": 1149.809375, "completions/min_length": 723.2, "completions/min_terminated_length": 723.2, "entropy": 0.36604792475700376, "epoch": 0.6933019976498237, "frac_reward_zero_std": 0.1, "grad_norm": 0.9135239720344543, "learning_rate": 4.441482917373395e-07, "loss": 0.0199, "num_tokens": 81789857.0, "reward": 0.6539062619209289, "reward_std": 0.19954511225223542, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6539062619209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.24059314727783204, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000005555152893, "sampling/importance_sampling_ratio/min": 0.4236325442790985, "sampling/sampling_logp_difference/max": 1.108893585205078, "sampling/sampling_logp_difference/mean": 0.015483367443084716, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1860.4, "completions/max_terminated_length": 1860.4, "completions/mean_length": 1070.809375, "completions/mean_terminated_length": 1070.809375, "completions/min_length": 663.2, "completions/min_terminated_length": 663.2, "entropy": 0.34452658891677856, "epoch": 0.699177438307873, "frac_reward_zero_std": 0.15, "grad_norm": 0.9200600981712341, "learning_rate": 4.4354252483644293e-07, "loss": 0.0155, "num_tokens": 82480708.0, "reward": 0.5846354305744171, "reward_std": 0.17398979663848876, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5846354305744171, "rewards/e2e_recall_precision_mixed_reward/std": 0.3412102937698364, "sampling/importance_sampling_ratio/max": 1.9801399946212768, "sampling/importance_sampling_ratio/mean": 0.9999470949172974, "sampling/importance_sampling_ratio/min": 0.43194087147712706, "sampling/sampling_logp_difference/max": 1.0336421251296997, "sampling/sampling_logp_difference/mean": 0.014913180842995644, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 2101.6, "completions/max_terminated_length": 2073.4, "completions/mean_length": 1216.24375, "completions/mean_terminated_length": 1208.71279296875, "completions/min_length": 721.6, "completions/min_terminated_length": 721.6, "entropy": 0.3706329584121704, "epoch": 0.7050528789659224, "frac_reward_zero_std": 0.25, "grad_norm": 0.7335398197174072, "learning_rate": 4.4293675793554636e-07, "loss": -0.0294, "num_tokens": 83165354.0, "reward": 0.600885421037674, "reward_std": 0.15805183053016664, "rewards/e2e_recall_precision_mixed_reward/mean": 0.600885421037674, "rewards/e2e_recall_precision_mixed_reward/std": 0.2339630365371704, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000630497932435, "sampling/importance_sampling_ratio/min": 0.29682928044348955, "sampling/sampling_logp_difference/max": 1.9509897232055664, "sampling/sampling_logp_difference/mean": 0.015584040805697441, "step": 600 }, { "epoch": 0.7050528789659224, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.00125, "eval_completions/max_length": 1885.44, "eval_completions/max_terminated_length": 1822.08, "eval_completions/mean_length": 1111.4675, "eval_completions/mean_terminated_length": 1109.69341796875, "eval_completions/min_length": 681.28, "eval_completions/min_terminated_length": 681.28, "eval_entropy": 0.35809629917144775, "eval_frac_reward_zero_std": 0.2, "eval_loss": 0.0031391119118779898, "eval_num_tokens": 83165354.0, "eval_reward": 0.5702604281902314, "eval_reward_std": 0.18180499017238616, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.5702604305744171, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.31943190515041353, "eval_runtime": 484.4398, "eval_samples_per_second": 0.206, "eval_sampling/importance_sampling_ratio/max": 1.9421213483810424, "eval_sampling/importance_sampling_ratio/mean": 1.000009124279022, "eval_sampling/importance_sampling_ratio/min": 0.32641365081071855, "eval_sampling/sampling_logp_difference/max": 1.2518345785140992, "eval_sampling/sampling_logp_difference/mean": 0.015406711027026176, "eval_steps_per_second": 0.004, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1916.6, "completions/max_terminated_length": 1916.6, "completions/mean_length": 1208.140625, "completions/mean_terminated_length": 1208.140625, "completions/min_length": 767.4, "completions/min_terminated_length": 767.4, "entropy": 0.36099555492401125, "epoch": 0.7109283196239718, "frac_reward_zero_std": 0.2, "grad_norm": 0.9186404943466187, "learning_rate": 4.423309910346498e-07, "loss": 0.0137, "num_tokens": 83870567.0, "reward": 0.6028645992279053, "reward_std": 0.18433087766170503, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6028645992279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.27474440932273864, "sampling/importance_sampling_ratio/max": 1.9217813730239868, "sampling/importance_sampling_ratio/mean": 1.0000322818756104, "sampling/importance_sampling_ratio/min": 0.33838503062725067, "sampling/sampling_logp_difference/max": 1.2144441843032836, "sampling/sampling_logp_difference/mean": 0.015485203452408313, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1942.8, "completions/max_terminated_length": 1843.8, "completions/mean_length": 1060.925, "completions/mean_terminated_length": 1056.8969482421876, "completions/min_length": 572.0, "completions/min_terminated_length": 572.0, "entropy": 0.3543337404727936, "epoch": 0.7168037602820212, "frac_reward_zero_std": 0.2, "grad_norm": 0.8076752424240112, "learning_rate": 4.417252241337533e-07, "loss": -0.0074, "num_tokens": 84533131.0, "reward": 0.6001562714576721, "reward_std": 0.1616973862051964, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6001562714576721, "rewards/e2e_recall_precision_mixed_reward/std": 0.2991356194019318, "sampling/importance_sampling_ratio/max": 1.8717167854309082, "sampling/importance_sampling_ratio/mean": 0.9999955654144287, "sampling/importance_sampling_ratio/min": 0.38679862320423125, "sampling/sampling_logp_difference/max": 1.0169011354446411, "sampling/sampling_logp_difference/mean": 0.015325301699340344, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1790.4, "completions/max_terminated_length": 1790.4, "completions/mean_length": 1141.934375, "completions/mean_terminated_length": 1141.934375, "completions/min_length": 691.8, "completions/min_terminated_length": 691.8, "entropy": 0.36986821293830874, "epoch": 0.7226792009400705, "frac_reward_zero_std": 0.3, "grad_norm": 0.7624120712280273, "learning_rate": 4.411194572328568e-07, "loss": -0.0142, "num_tokens": 85203078.0, "reward": 0.6421875119209289, "reward_std": 0.15983470678329467, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6421875119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.24894966185092926, "sampling/importance_sampling_ratio/max": 1.9372856378555299, "sampling/importance_sampling_ratio/mean": 0.9998981475830078, "sampling/importance_sampling_ratio/min": 0.3545094013214111, "sampling/sampling_logp_difference/max": 1.174563217163086, "sampling/sampling_logp_difference/mean": 0.015732382237911225, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1923.0, "completions/max_terminated_length": 1923.0, "completions/mean_length": 1150.5875, "completions/mean_terminated_length": 1150.5875, "completions/min_length": 751.0, "completions/min_terminated_length": 751.0, "entropy": 0.3715690791606903, "epoch": 0.7285546415981199, "frac_reward_zero_std": 0.2, "grad_norm": 0.5657132863998413, "learning_rate": 4.405136903319602e-07, "loss": 0.0199, "num_tokens": 85870882.0, "reward": 0.5806770920753479, "reward_std": 0.17638582587242127, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5806770920753479, "rewards/e2e_recall_precision_mixed_reward/std": 0.3543856680393219, "sampling/importance_sampling_ratio/max": 1.9489038705825805, "sampling/importance_sampling_ratio/mean": 1.0000184893608093, "sampling/importance_sampling_ratio/min": 0.34882347881793974, "sampling/sampling_logp_difference/max": 1.131108021736145, "sampling/sampling_logp_difference/mean": 0.015539542399346828, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 2070.0, "completions/max_terminated_length": 1961.2, "completions/mean_length": 1203.53125, "completions/mean_terminated_length": 1199.6821533203124, "completions/min_length": 765.4, "completions/min_terminated_length": 765.4, "entropy": 0.364533132314682, "epoch": 0.7344300822561692, "frac_reward_zero_std": 0.2, "grad_norm": 0.6601881980895996, "learning_rate": 4.399079234310637e-07, "loss": 0.0201, "num_tokens": 86598024.0, "reward": 0.5711979389190673, "reward_std": 0.18835237622261047, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5711979389190673, "rewards/e2e_recall_precision_mixed_reward/std": 0.3303461790084839, "sampling/importance_sampling_ratio/max": 1.9624412298202514, "sampling/importance_sampling_ratio/mean": 0.9998656749725342, "sampling/importance_sampling_ratio/min": 0.37342591881752013, "sampling/sampling_logp_difference/max": 1.0346963167190553, "sampling/sampling_logp_difference/mean": 0.015363234095275402, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1870.8, "completions/max_terminated_length": 1870.8, "completions/mean_length": 1130.415625, "completions/mean_terminated_length": 1130.415625, "completions/min_length": 705.6, "completions/min_terminated_length": 705.6, "entropy": 0.3572323977947235, "epoch": 0.7403055229142186, "frac_reward_zero_std": 0.25, "grad_norm": 0.4808492064476013, "learning_rate": 4.3930215653016714e-07, "loss": 0.0067, "num_tokens": 87275725.0, "reward": 0.5479687631130219, "reward_std": 0.1671212889254093, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5479687631130219, "rewards/e2e_recall_precision_mixed_reward/std": 0.2871303677558899, "sampling/importance_sampling_ratio/max": 1.9045668125152588, "sampling/importance_sampling_ratio/mean": 1.0000359773635865, "sampling/importance_sampling_ratio/min": 0.390330970287323, "sampling/sampling_logp_difference/max": 0.9632290363311767, "sampling/sampling_logp_difference/mean": 0.015123646520078183, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1733.6, "completions/max_terminated_length": 1733.6, "completions/mean_length": 1164.278125, "completions/mean_terminated_length": 1164.278125, "completions/min_length": 786.2, "completions/min_terminated_length": 786.2, "entropy": 0.38033042550086976, "epoch": 0.7461809635722679, "frac_reward_zero_std": 0.15, "grad_norm": 0.8040631413459778, "learning_rate": 4.3869638962927063e-07, "loss": 0.0232, "num_tokens": 87977846.0, "reward": 0.6135416746139526, "reward_std": 0.17617344856262207, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6135416805744172, "rewards/e2e_recall_precision_mixed_reward/std": 0.30681161880493163, "sampling/importance_sampling_ratio/max": 1.8931318521499634, "sampling/importance_sampling_ratio/mean": 0.9999570846557617, "sampling/importance_sampling_ratio/min": 0.42780248522758485, "sampling/sampling_logp_difference/max": 0.8543825387954712, "sampling/sampling_logp_difference/mean": 0.0160001702606678, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2039.0, "completions/max_terminated_length": 1983.8, "completions/mean_length": 1142.871875, "completions/mean_terminated_length": 1123.12822265625, "completions/min_length": 655.6, "completions/min_terminated_length": 655.6, "entropy": 0.3652238368988037, "epoch": 0.7520564042303173, "frac_reward_zero_std": 0.25, "grad_norm": 0.8171776533126831, "learning_rate": 4.380906227283741e-07, "loss": 0.0168, "num_tokens": 88659353.0, "reward": 0.645781260728836, "reward_std": 0.16527300626039504, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6457812786102295, "rewards/e2e_recall_precision_mixed_reward/std": 0.32556850016117095, "sampling/importance_sampling_ratio/max": 1.9582969903945924, "sampling/importance_sampling_ratio/mean": 0.999891209602356, "sampling/importance_sampling_ratio/min": 0.3786491721868515, "sampling/sampling_logp_difference/max": 1.0249874234199523, "sampling/sampling_logp_difference/mean": 0.015227846615016461, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1975.0, "completions/max_terminated_length": 1815.6, "completions/mean_length": 1169.203125, "completions/mean_terminated_length": 1165.0383544921874, "completions/min_length": 741.8, "completions/min_terminated_length": 741.8, "entropy": 0.3585414707660675, "epoch": 0.7579318448883666, "frac_reward_zero_std": 0.15, "grad_norm": 0.8918953537940979, "learning_rate": 4.3748485582747756e-07, "loss": 0.0061, "num_tokens": 89353142.0, "reward": 0.5856250166893006, "reward_std": 0.22155131995677949, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5856250107288361, "rewards/e2e_recall_precision_mixed_reward/std": 0.2939130663871765, "sampling/importance_sampling_ratio/max": 1.9639912843704224, "sampling/importance_sampling_ratio/mean": 0.9999972224235535, "sampling/importance_sampling_ratio/min": 0.37892901003360746, "sampling/sampling_logp_difference/max": 0.9988775253295898, "sampling/sampling_logp_difference/mean": 0.014916538074612618, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2024.8, "completions/max_terminated_length": 2024.8, "completions/mean_length": 1198.5, "completions/mean_terminated_length": 1198.5, "completions/min_length": 787.4, "completions/min_terminated_length": 787.4, "entropy": 0.3784613788127899, "epoch": 0.763807285546416, "frac_reward_zero_std": 0.25, "grad_norm": 0.806064248085022, "learning_rate": 4.3687908892658105e-07, "loss": 0.001, "num_tokens": 90064694.0, "reward": 0.6430729389190674, "reward_std": 0.19357036650180817, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6430729389190674, "rewards/e2e_recall_precision_mixed_reward/std": 0.31154467463493346, "sampling/importance_sampling_ratio/max": 1.9199378967285157, "sampling/importance_sampling_ratio/mean": 0.9998810887336731, "sampling/importance_sampling_ratio/min": 0.29518369734287264, "sampling/sampling_logp_difference/max": 1.2997811555862426, "sampling/sampling_logp_difference/mean": 0.0156770009547472, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1619.2, "completions/max_terminated_length": 1619.2, "completions/mean_length": 1067.253125, "completions/mean_terminated_length": 1067.253125, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "entropy": 0.3725395202636719, "epoch": 0.7696827262044653, "frac_reward_zero_std": 0.3, "grad_norm": 0.9183342456817627, "learning_rate": 4.3627332202568454e-07, "loss": 0.0011, "num_tokens": 90727559.0, "reward": 0.5644791722297668, "reward_std": 0.1635679453611374, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5644791722297668, "rewards/e2e_recall_precision_mixed_reward/std": 0.37316291928291323, "sampling/importance_sampling_ratio/max": 1.9169376850128175, "sampling/importance_sampling_ratio/mean": 0.9999460935592651, "sampling/importance_sampling_ratio/min": 0.3552225947380066, "sampling/sampling_logp_difference/max": 1.0906750798225402, "sampling/sampling_logp_difference/mean": 0.015604752302169799, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1765.0, "completions/max_terminated_length": 1765.0, "completions/mean_length": 1024.040625, "completions/mean_terminated_length": 1024.040625, "completions/min_length": 587.4, "completions/min_terminated_length": 587.4, "entropy": 0.3731545150279999, "epoch": 0.7755581668625147, "frac_reward_zero_std": 0.35, "grad_norm": 0.8010883927345276, "learning_rate": 4.3566755512478797e-07, "loss": -0.0083, "num_tokens": 91353300.0, "reward": 0.6674479246139526, "reward_std": 0.1368040680885315, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6674479246139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.3341027498245239, "sampling/importance_sampling_ratio/max": 1.926101303100586, "sampling/importance_sampling_ratio/mean": 0.9999330401420593, "sampling/importance_sampling_ratio/min": 0.28611020296812056, "sampling/sampling_logp_difference/max": 1.6566476583480836, "sampling/sampling_logp_difference/mean": 0.01547999307513237, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1819.4, "completions/max_terminated_length": 1703.6, "completions/mean_length": 1121.246875, "completions/mean_terminated_length": 1117.4317626953125, "completions/min_length": 722.4, "completions/min_terminated_length": 722.4, "entropy": 0.3596222817897797, "epoch": 0.781433607520564, "frac_reward_zero_std": 0.3, "grad_norm": 0.37091967463493347, "learning_rate": 4.3506178822389146e-07, "loss": 0.0105, "num_tokens": 92051055.0, "reward": 0.5822916746139526, "reward_std": 0.14248881340026856, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5822916805744172, "rewards/e2e_recall_precision_mixed_reward/std": 0.28063163459300994, "sampling/importance_sampling_ratio/max": 1.9675161600112916, "sampling/importance_sampling_ratio/mean": 1.000024175643921, "sampling/importance_sampling_ratio/min": 0.34341188669204714, "sampling/sampling_logp_difference/max": 1.362108290195465, "sampling/sampling_logp_difference/mean": 0.015318811498582363, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1901.6, "completions/max_terminated_length": 1901.6, "completions/mean_length": 1049.240625, "completions/mean_terminated_length": 1049.240625, "completions/min_length": 705.2, "completions/min_terminated_length": 705.2, "entropy": 0.3371972322463989, "epoch": 0.7873090481786134, "frac_reward_zero_std": 0.15, "grad_norm": 0.9466339945793152, "learning_rate": 4.344560213229949e-07, "loss": 0.0087, "num_tokens": 92731788.0, "reward": 0.621875, "reward_std": 0.17843145430088042, "rewards/e2e_recall_precision_mixed_reward/mean": 0.621875, "rewards/e2e_recall_precision_mixed_reward/std": 0.3395528972148895, "sampling/importance_sampling_ratio/max": 1.868542456626892, "sampling/importance_sampling_ratio/mean": 1.0001242876052856, "sampling/importance_sampling_ratio/min": 0.31800718009471896, "sampling/sampling_logp_difference/max": 1.1785770416259767, "sampling/sampling_logp_difference/mean": 0.014529785700142383, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1871.6, "completions/max_terminated_length": 1871.6, "completions/mean_length": 1122.5875, "completions/mean_terminated_length": 1122.5875, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "entropy": 0.3579367399215698, "epoch": 0.7931844888366627, "frac_reward_zero_std": 0.2, "grad_norm": 0.8181319832801819, "learning_rate": 4.338502544220984e-07, "loss": 0.0265, "num_tokens": 93410776.0, "reward": 0.5313541889190674, "reward_std": 0.16972679942846297, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5313541829586029, "rewards/e2e_recall_precision_mixed_reward/std": 0.28352001309394836, "sampling/importance_sampling_ratio/max": 1.973141074180603, "sampling/importance_sampling_ratio/mean": 1.0001737236976624, "sampling/importance_sampling_ratio/min": 0.4472075402736664, "sampling/sampling_logp_difference/max": 0.917256760597229, "sampling/sampling_logp_difference/mean": 0.015095295198261739, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1637.2, "completions/max_terminated_length": 1637.2, "completions/mean_length": 1013.86875, "completions/mean_terminated_length": 1013.86875, "completions/min_length": 630.2, "completions/min_terminated_length": 630.2, "entropy": 0.34675438404083253, "epoch": 0.799059929494712, "frac_reward_zero_std": 0.25, "grad_norm": 0.9746776819229126, "learning_rate": 4.3324448752120177e-07, "loss": -0.0082, "num_tokens": 94042670.0, "reward": 0.6797916769981385, "reward_std": 0.14540175199508668, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6797916769981385, "rewards/e2e_recall_precision_mixed_reward/std": 0.31566589772701265, "sampling/importance_sampling_ratio/max": 1.8168800592422485, "sampling/importance_sampling_ratio/mean": 0.9998704791069031, "sampling/importance_sampling_ratio/min": 0.343018639087677, "sampling/sampling_logp_difference/max": 1.1331130743026734, "sampling/sampling_logp_difference/mean": 0.014893803745508194, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1943.6, "completions/max_terminated_length": 1943.6, "completions/mean_length": 1118.8375, "completions/mean_terminated_length": 1118.8375, "completions/min_length": 641.8, "completions/min_terminated_length": 641.8, "entropy": 0.3540935754776001, "epoch": 0.8049353701527615, "frac_reward_zero_std": 0.3, "grad_norm": 0.9660540223121643, "learning_rate": 4.3263872062030526e-07, "loss": -0.0026, "num_tokens": 94725594.0, "reward": 0.6530729293823242, "reward_std": 0.10995356068015098, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6530729293823242, "rewards/e2e_recall_precision_mixed_reward/std": 0.2734446346759796, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000356674194335, "sampling/importance_sampling_ratio/min": 0.2624353013234213, "sampling/sampling_logp_difference/max": 2.294406795501709, "sampling/sampling_logp_difference/mean": 0.015182534791529178, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1589.6, "completions/max_terminated_length": 1589.6, "completions/mean_length": 1034.95, "completions/mean_terminated_length": 1034.95, "completions/min_length": 558.2, "completions/min_terminated_length": 558.2, "entropy": 0.3575005650520325, "epoch": 0.8108108108108109, "frac_reward_zero_std": 0.15, "grad_norm": 1.0704731941223145, "learning_rate": 4.3203295371940875e-07, "loss": 0.0023, "num_tokens": 95370026.0, "reward": 0.6488541722297668, "reward_std": 0.19183206856250762, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6488541722297668, "rewards/e2e_recall_precision_mixed_reward/std": 0.3406189620494843, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000473976135253, "sampling/importance_sampling_ratio/min": 0.361629045009613, "sampling/sampling_logp_difference/max": 1.0578797578811645, "sampling/sampling_logp_difference/mean": 0.015206655859947205, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1792.8, "completions/max_terminated_length": 1792.8, "completions/mean_length": 1117.090625, "completions/mean_terminated_length": 1117.090625, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "entropy": 0.3441681444644928, "epoch": 0.8166862514688602, "frac_reward_zero_std": 0.1, "grad_norm": 0.8466465473175049, "learning_rate": 4.314271868185122e-07, "loss": 0.0066, "num_tokens": 96058743.0, "reward": 0.6003645956516266, "reward_std": 0.21395085752010345, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6003645956516266, "rewards/e2e_recall_precision_mixed_reward/std": 0.3233032822608948, "sampling/importance_sampling_ratio/max": 1.8979068994522095, "sampling/importance_sampling_ratio/mean": 1.0000794172286986, "sampling/importance_sampling_ratio/min": 0.38034394979476926, "sampling/sampling_logp_difference/max": 1.0374782085418701, "sampling/sampling_logp_difference/mean": 0.01497993227094412, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1584.2, "completions/max_terminated_length": 1584.2, "completions/mean_length": 1046.925, "completions/mean_terminated_length": 1046.925, "completions/min_length": 636.6, "completions/min_terminated_length": 636.6, "entropy": 0.3259464383125305, "epoch": 0.8225616921269095, "frac_reward_zero_std": 0.1, "grad_norm": 0.7266973853111267, "learning_rate": 4.308214199176157e-07, "loss": 0.0302, "num_tokens": 96709727.0, "reward": 0.6774479508399963, "reward_std": 0.1927065670490265, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6774479389190674, "rewards/e2e_recall_precision_mixed_reward/std": 0.2997411698102951, "sampling/importance_sampling_ratio/max": 1.8562868118286133, "sampling/importance_sampling_ratio/mean": 0.9998578786849975, "sampling/importance_sampling_ratio/min": 0.28235826790332796, "sampling/sampling_logp_difference/max": 1.3985157489776612, "sampling/sampling_logp_difference/mean": 0.014192759618163108, "step": 700 }, { "epoch": 0.8225616921269095, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1574.08, "eval_completions/max_terminated_length": 1574.08, "eval_completions/mean_length": 1028.64625, "eval_completions/mean_terminated_length": 1028.64625, "eval_completions/min_length": 622.04, "eval_completions/min_terminated_length": 622.04, "eval_entropy": 0.33704278111457825, "eval_frac_reward_zero_std": 0.25, "eval_loss": 0.0007254349184222519, "eval_num_tokens": 96709727.0, "eval_reward": 0.6012395948171616, "eval_reward_std": 0.1713283321261406, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.6012395972013473, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3278186982870102, "eval_runtime": 422.8508, "eval_samples_per_second": 0.236, "eval_sampling/importance_sampling_ratio/max": 1.9145189142227172, "eval_sampling/importance_sampling_ratio/mean": 1.000025644302368, "eval_sampling/importance_sampling_ratio/min": 0.3480164834856987, "eval_sampling/sampling_logp_difference/max": 1.2185939931869507, "eval_sampling/sampling_logp_difference/mean": 0.014661024622619152, "eval_steps_per_second": 0.005, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1610.8, "completions/max_terminated_length": 1610.8, "completions/mean_length": 1018.06875, "completions/mean_terminated_length": 1018.06875, "completions/min_length": 606.0, "completions/min_terminated_length": 606.0, "entropy": 0.3368810176849365, "epoch": 0.8284371327849589, "frac_reward_zero_std": 0.3, "grad_norm": 0.8269535899162292, "learning_rate": 4.3021565301671916e-07, "loss": -0.0007, "num_tokens": 97340021.0, "reward": 0.6871875166893006, "reward_std": 0.15638408213853836, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6871875166893006, "rewards/e2e_recall_precision_mixed_reward/std": 0.27981995344161986, "sampling/importance_sampling_ratio/max": 1.813429832458496, "sampling/importance_sampling_ratio/mean": 0.9999094605445862, "sampling/importance_sampling_ratio/min": 0.4124268352985382, "sampling/sampling_logp_difference/max": 0.9072500944137574, "sampling/sampling_logp_difference/mean": 0.014728988707065582, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 1579.2, "completions/max_terminated_length": 1500.6, "completions/mean_length": 974.878125, "completions/mean_terminated_length": 965.8274169921875, "completions/min_length": 631.2, "completions/min_terminated_length": 631.2, "entropy": 0.32004879117012025, "epoch": 0.8343125734430082, "frac_reward_zero_std": 0.3, "grad_norm": 0.7339820265769958, "learning_rate": 4.296098861158226e-07, "loss": -0.0021, "num_tokens": 97975014.0, "reward": 0.7150000095367431, "reward_std": 0.12368128597736358, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7150000214576722, "rewards/e2e_recall_precision_mixed_reward/std": 0.2955649644136429, "sampling/importance_sampling_ratio/max": 1.9696100234985352, "sampling/importance_sampling_ratio/mean": 0.9999983787536622, "sampling/importance_sampling_ratio/min": 0.3940297782421112, "sampling/sampling_logp_difference/max": 0.9819095611572266, "sampling/sampling_logp_difference/mean": 0.014322330988943578, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1625.6, "completions/max_terminated_length": 1625.6, "completions/mean_length": 1042.034375, "completions/mean_terminated_length": 1042.034375, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "entropy": 0.3161555349826813, "epoch": 0.8401880141010576, "frac_reward_zero_std": 0.3, "grad_norm": 0.8281562924385071, "learning_rate": 4.290041192149261e-07, "loss": 0.003, "num_tokens": 98647761.0, "reward": 0.6562500119209289, "reward_std": 0.1516960322856903, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6562500119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.31697266101837157, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999948978424072, "sampling/importance_sampling_ratio/min": 0.34369261264801027, "sampling/sampling_logp_difference/max": 1.1635895490646362, "sampling/sampling_logp_difference/mean": 0.014115219749510289, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 1750.6, "completions/max_terminated_length": 1666.6, "completions/mean_length": 1037.1875, "completions/mean_terminated_length": 1027.6480834960937, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "entropy": 0.3309583842754364, "epoch": 0.8460634547591069, "frac_reward_zero_std": 0.3, "grad_norm": 0.9976007342338562, "learning_rate": 4.283983523140295e-07, "loss": -0.0162, "num_tokens": 99314501.0, "reward": 0.6807812452316284, "reward_std": 0.1338688015937805, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6807812452316284, "rewards/e2e_recall_precision_mixed_reward/std": 0.3191369116306305, "sampling/importance_sampling_ratio/max": 1.9326151609420776, "sampling/importance_sampling_ratio/mean": 0.9999658942222596, "sampling/importance_sampling_ratio/min": 0.4041714251041412, "sampling/sampling_logp_difference/max": 1.0760185718536377, "sampling/sampling_logp_difference/mean": 0.01456475555896759, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1600.4, "completions/max_terminated_length": 1600.4, "completions/mean_length": 1046.459375, "completions/mean_terminated_length": 1046.459375, "completions/min_length": 649.2, "completions/min_terminated_length": 649.2, "entropy": 0.3314117074012756, "epoch": 0.8519388954171563, "frac_reward_zero_std": 0.25, "grad_norm": 0.4570290446281433, "learning_rate": 4.27792585413133e-07, "loss": 0.0001, "num_tokens": 99966632.0, "reward": 0.576354193687439, "reward_std": 0.14871049374341966, "rewards/e2e_recall_precision_mixed_reward/mean": 0.576354193687439, "rewards/e2e_recall_precision_mixed_reward/std": 0.3146588295698166, "sampling/importance_sampling_ratio/max": 1.9622956752777099, "sampling/importance_sampling_ratio/mean": 0.9999426603317261, "sampling/importance_sampling_ratio/min": 0.41331798434257505, "sampling/sampling_logp_difference/max": 0.8975541591644287, "sampling/sampling_logp_difference/mean": 0.014644469693303108, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1559.8, "completions/max_terminated_length": 1559.8, "completions/mean_length": 1027.846875, "completions/mean_terminated_length": 1027.846875, "completions/min_length": 620.4, "completions/min_terminated_length": 620.4, "entropy": 0.3280536949634552, "epoch": 0.8578143360752056, "frac_reward_zero_std": 0.2, "grad_norm": 0.6543668508529663, "learning_rate": 4.271868185122365e-07, "loss": 0.0029, "num_tokens": 100642375.0, "reward": 0.54197918176651, "reward_std": 0.17115794867277145, "rewards/e2e_recall_precision_mixed_reward/mean": 0.54197918176651, "rewards/e2e_recall_precision_mixed_reward/std": 0.36070212721824646, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000412940979004, "sampling/importance_sampling_ratio/min": 0.3607532560825348, "sampling/sampling_logp_difference/max": 1.123932695388794, "sampling/sampling_logp_difference/mean": 0.014646673388779164, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1375.8, "completions/max_terminated_length": 1375.8, "completions/mean_length": 977.796875, "completions/mean_terminated_length": 977.796875, "completions/min_length": 622.4, "completions/min_terminated_length": 622.4, "entropy": 0.31903391480445864, "epoch": 0.863689776733255, "frac_reward_zero_std": 0.3, "grad_norm": 0.9127265810966492, "learning_rate": 4.2658105161133994e-07, "loss": -0.004, "num_tokens": 101287654.0, "reward": 0.6091145873069763, "reward_std": 0.14868417531251907, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6091145873069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.2740251809358597, "sampling/importance_sampling_ratio/max": 1.9419521808624267, "sampling/importance_sampling_ratio/mean": 1.0001082062721252, "sampling/importance_sampling_ratio/min": 0.39746988415718076, "sampling/sampling_logp_difference/max": 1.070701003074646, "sampling/sampling_logp_difference/mean": 0.014321417547762394, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1902.4, "completions/max_terminated_length": 1772.4, "completions/mean_length": 1019.028125, "completions/mean_terminated_length": 1014.390185546875, "completions/min_length": 597.6, "completions/min_terminated_length": 597.6, "entropy": 0.2945869266986847, "epoch": 0.8695652173913043, "frac_reward_zero_std": 0.05, "grad_norm": 1.0506489276885986, "learning_rate": 4.2597528471044343e-07, "loss": 0.0036, "num_tokens": 101951611.0, "reward": 0.6753125071525574, "reward_std": 0.19040150046348572, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6753125190734863, "rewards/e2e_recall_precision_mixed_reward/std": 0.29441867470741273, "sampling/importance_sampling_ratio/max": 1.9205948114395142, "sampling/importance_sampling_ratio/mean": 0.9999702334403991, "sampling/importance_sampling_ratio/min": 0.2421664908528328, "sampling/sampling_logp_difference/max": 1.5547382235527039, "sampling/sampling_logp_difference/mean": 0.013843800313770771, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1479.0, "completions/max_terminated_length": 1479.0, "completions/mean_length": 976.290625, "completions/mean_terminated_length": 976.290625, "completions/min_length": 609.4, "completions/min_terminated_length": 609.4, "entropy": 0.2982940495014191, "epoch": 0.8754406580493537, "frac_reward_zero_std": 0.25, "grad_norm": 0.7624930143356323, "learning_rate": 4.2536951780954687e-07, "loss": 0.0197, "num_tokens": 102593608.0, "reward": 0.6481770992279052, "reward_std": 0.16066179424524307, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6481770992279052, "rewards/e2e_recall_precision_mixed_reward/std": 0.28471835851669314, "sampling/importance_sampling_ratio/max": 1.9874407529830933, "sampling/importance_sampling_ratio/mean": 0.9999994158744812, "sampling/importance_sampling_ratio/min": 0.4114152193069458, "sampling/sampling_logp_difference/max": 0.9713044166564941, "sampling/sampling_logp_difference/mean": 0.013807599991559982, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1364.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 979.490625, "completions/mean_terminated_length": 979.490625, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "entropy": 0.29739575982093813, "epoch": 0.881316098707403, "frac_reward_zero_std": 0.25, "grad_norm": 0.9040115475654602, "learning_rate": 4.2476375090865036e-07, "loss": 0.0015, "num_tokens": 103239925.0, "reward": 0.5628646016120911, "reward_std": 0.14127539545297624, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5628646016120911, "rewards/e2e_recall_precision_mixed_reward/std": 0.3246196061372757, "sampling/importance_sampling_ratio/max": 1.9362622261047364, "sampling/importance_sampling_ratio/mean": 0.9999957323074341, "sampling/importance_sampling_ratio/min": 0.45249093174934385, "sampling/sampling_logp_difference/max": 0.9302958488464356, "sampling/sampling_logp_difference/mean": 0.013709403574466705, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1269.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 941.628125, "completions/mean_terminated_length": 941.628125, "completions/min_length": 668.6, "completions/min_terminated_length": 668.6, "entropy": 0.2873253464698792, "epoch": 0.8871915393654524, "frac_reward_zero_std": 0.4, "grad_norm": 0.7270746231079102, "learning_rate": 4.2415798400775385e-07, "loss": -0.0041, "num_tokens": 103848574.0, "reward": 0.6278125166893005, "reward_std": 0.13141954243183135, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6278125166893005, "rewards/e2e_recall_precision_mixed_reward/std": 0.20140623152256013, "sampling/importance_sampling_ratio/max": 1.9731908082962035, "sampling/importance_sampling_ratio/mean": 0.9999493479728698, "sampling/importance_sampling_ratio/min": 0.29442899525165556, "sampling/sampling_logp_difference/max": 1.5388960361480712, "sampling/sampling_logp_difference/mean": 0.013377194851636886, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1579.2, "completions/max_terminated_length": 1468.4, "completions/mean_length": 1000.2125, "completions/mean_terminated_length": 995.6033935546875, "completions/min_length": 662.0, "completions/min_terminated_length": 662.0, "entropy": 0.28227120637893677, "epoch": 0.8930669800235017, "frac_reward_zero_std": 0.15, "grad_norm": 0.8500383496284485, "learning_rate": 4.2355221710685723e-07, "loss": 0.0123, "num_tokens": 104456958.0, "reward": 0.6193229198455811, "reward_std": 0.2092406004667282, "rewards/e2e_recall_precision_mixed_reward/mean": 0.61932293176651, "rewards/e2e_recall_precision_mixed_reward/std": 0.31036264896392823, "sampling/importance_sampling_ratio/max": 1.9621197700500488, "sampling/importance_sampling_ratio/mean": 0.9999842524528504, "sampling/importance_sampling_ratio/min": 0.37887851893901825, "sampling/sampling_logp_difference/max": 1.138273000717163, "sampling/sampling_logp_difference/mean": 0.013238861598074437, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1348.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 896.709375, "completions/mean_terminated_length": 896.709375, "completions/min_length": 568.8, "completions/min_terminated_length": 568.8, "entropy": 0.29374930262565613, "epoch": 0.8989424206815512, "frac_reward_zero_std": 0.2, "grad_norm": 0.9952046871185303, "learning_rate": 4.229464502059607e-07, "loss": -0.006, "num_tokens": 105078945.0, "reward": 0.6104687571525573, "reward_std": 0.16176794916391374, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6104687750339508, "rewards/e2e_recall_precision_mixed_reward/std": 0.2711296409368515, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000908970832825, "sampling/importance_sampling_ratio/min": 0.32102622333914044, "sampling/sampling_logp_difference/max": 2.261718225479126, "sampling/sampling_logp_difference/mean": 0.01387611273676157, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1570.8, "completions/max_terminated_length": 1437.8, "completions/mean_length": 943.59375, "completions/mean_terminated_length": 938.8944580078125, "completions/min_length": 564.8, "completions/min_terminated_length": 564.8, "entropy": 0.281459829211235, "epoch": 0.9048178613396005, "frac_reward_zero_std": 0.15, "grad_norm": 0.8672675490379333, "learning_rate": 4.2234068330506415e-07, "loss": 0.0008, "num_tokens": 105700507.0, "reward": 0.5471354305744172, "reward_std": 0.19499558210372925, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5471354305744172, "rewards/e2e_recall_precision_mixed_reward/std": 0.3031169354915619, "sampling/importance_sampling_ratio/max": 1.986432147026062, "sampling/importance_sampling_ratio/mean": 0.999950909614563, "sampling/importance_sampling_ratio/min": 0.22982776015996934, "sampling/sampling_logp_difference/max": 1.6226904392242432, "sampling/sampling_logp_difference/mean": 0.013240163028240205, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1298.8, "completions/max_terminated_length": 1298.8, "completions/mean_length": 922.871875, "completions/mean_terminated_length": 922.871875, "completions/min_length": 498.8, "completions/min_terminated_length": 498.8, "entropy": 0.2916738152503967, "epoch": 0.9106933019976499, "frac_reward_zero_std": 0.15, "grad_norm": 0.9539207220077515, "learning_rate": 4.2173491640416764e-07, "loss": 0.0091, "num_tokens": 106338834.0, "reward": 0.5891145884990692, "reward_std": 0.19707020074129106, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5891146004199982, "rewards/e2e_recall_precision_mixed_reward/std": 0.347181960940361, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999728560447693, "sampling/importance_sampling_ratio/min": 0.4203726887702942, "sampling/sampling_logp_difference/max": 1.1795375108718873, "sampling/sampling_logp_difference/mean": 0.013758598454296589, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1438.6, "completions/max_terminated_length": 1438.6, "completions/mean_length": 934.446875, "completions/mean_terminated_length": 934.446875, "completions/min_length": 561.4, "completions/min_terminated_length": 561.4, "entropy": 0.28705472946166993, "epoch": 0.9165687426556992, "frac_reward_zero_std": 0.4, "grad_norm": 0.8895977139472961, "learning_rate": 4.2112914950327113e-07, "loss": 0.0087, "num_tokens": 107003489.0, "reward": 0.5985937595367432, "reward_std": 0.12468727231025696, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5985937595367432, "rewards/e2e_recall_precision_mixed_reward/std": 0.36896389722824097, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000386953353881, "sampling/importance_sampling_ratio/min": 0.3284500002861023, "sampling/sampling_logp_difference/max": 1.1372824430465698, "sampling/sampling_logp_difference/mean": 0.013641990534961224, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1454.8, "completions/max_terminated_length": 1454.8, "completions/mean_length": 969.2, "completions/mean_terminated_length": 969.2, "completions/min_length": 629.2, "completions/min_terminated_length": 629.2, "entropy": 0.280923467874527, "epoch": 0.9224441833137486, "frac_reward_zero_std": 0.2, "grad_norm": 0.6472572088241577, "learning_rate": 4.2052338260237457e-07, "loss": 0.0042, "num_tokens": 107634865.0, "reward": 0.7227083444595337, "reward_std": 0.16849884390830994, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7227083444595337, "rewards/e2e_recall_precision_mixed_reward/std": 0.3031183272600174, "sampling/importance_sampling_ratio/max": 1.7831587314605712, "sampling/importance_sampling_ratio/mean": 0.9999613523483276, "sampling/importance_sampling_ratio/min": 0.37517508268356325, "sampling/sampling_logp_difference/max": 0.9865343809127808, "sampling/sampling_logp_difference/mean": 0.013296735659241676, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1424.4, "completions/max_terminated_length": 1424.4, "completions/mean_length": 945.209375, "completions/mean_terminated_length": 945.209375, "completions/min_length": 593.8, "completions/min_terminated_length": 593.8, "entropy": 0.28385655879974364, "epoch": 0.9283196239717979, "frac_reward_zero_std": 0.15, "grad_norm": 0.8925604820251465, "learning_rate": 4.1991761570147806e-07, "loss": -0.0085, "num_tokens": 108281476.0, "reward": 0.7040104150772095, "reward_std": 0.17787585258483887, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7040104150772095, "rewards/e2e_recall_precision_mixed_reward/std": 0.2852801322937012, "sampling/importance_sampling_ratio/max": 1.9631269216537475, "sampling/importance_sampling_ratio/mean": 1.000104796886444, "sampling/importance_sampling_ratio/min": 0.2682963252067566, "sampling/sampling_logp_difference/max": 1.5255920171737671, "sampling/sampling_logp_difference/mean": 0.013569644838571548, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1297.4, "completions/max_terminated_length": 1297.4, "completions/mean_length": 942.734375, "completions/mean_terminated_length": 942.734375, "completions/min_length": 572.8, "completions/min_terminated_length": 572.8, "entropy": 0.28979550004005433, "epoch": 0.9341950646298472, "frac_reward_zero_std": 0.4, "grad_norm": 0.667854905128479, "learning_rate": 4.193118488005815e-07, "loss": 0.0068, "num_tokens": 108890495.0, "reward": 0.5463541865348815, "reward_std": 0.12234707921743393, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5463541865348815, "rewards/e2e_recall_precision_mixed_reward/std": 0.35137582421302793, "sampling/importance_sampling_ratio/max": 1.9598425388336183, "sampling/importance_sampling_ratio/mean": 1.0001339554786681, "sampling/importance_sampling_ratio/min": 0.37145259976387024, "sampling/sampling_logp_difference/max": 1.0230091214179993, "sampling/sampling_logp_difference/mean": 0.01373386587947607, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1364.6, "completions/max_terminated_length": 1364.6, "completions/mean_length": 996.2625, "completions/mean_terminated_length": 996.2625, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "entropy": 0.27299547791481016, "epoch": 0.9400705052878966, "frac_reward_zero_std": 0.4, "grad_norm": 0.45995670557022095, "learning_rate": 4.18706081899685e-07, "loss": 0.0054, "num_tokens": 109527747.0, "reward": 0.5731770873069764, "reward_std": 0.12794461846351624, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5731770873069764, "rewards/e2e_recall_precision_mixed_reward/std": 0.31133740544319155, "sampling/importance_sampling_ratio/max": 1.9356085777282714, "sampling/importance_sampling_ratio/mean": 0.9999878287315369, "sampling/importance_sampling_ratio/min": 0.4067450284957886, "sampling/sampling_logp_difference/max": 0.9950890302658081, "sampling/sampling_logp_difference/mean": 0.012918901070952416, "step": 800 }, { "epoch": 0.9400705052878966, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1378.96, "eval_completions/max_terminated_length": 1378.96, "eval_completions/mean_length": 902.095625, "eval_completions/mean_terminated_length": 902.095625, "eval_completions/min_length": 568.84, "eval_completions/min_terminated_length": 568.84, "eval_entropy": 0.2893510788679123, "eval_frac_reward_zero_std": 0.3, "eval_loss": 0.0025282795540988445, "eval_num_tokens": 109527747.0, "eval_reward": 0.6336666762828826, "eval_reward_std": 0.1548786437511444, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.6336666780710221, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3238634419441223, "eval_runtime": 369.8544, "eval_samples_per_second": 0.27, "eval_sampling/importance_sampling_ratio/max": 1.9275440073013306, "eval_sampling/importance_sampling_ratio/mean": 0.9999588131904602, "eval_sampling/importance_sampling_ratio/min": 0.3401543361693621, "eval_sampling/sampling_logp_difference/max": 1.2501714992523194, "eval_sampling/sampling_logp_difference/mean": 0.013715669251978398, "eval_steps_per_second": 0.005, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1472.6, "completions/max_terminated_length": 1472.6, "completions/mean_length": 959.875, "completions/mean_terminated_length": 959.875, "completions/min_length": 569.0, "completions/min_terminated_length": 569.0, "entropy": 0.30030797719955443, "epoch": 0.9459459459459459, "frac_reward_zero_std": 0.25, "grad_norm": 0.8915446996688843, "learning_rate": 4.181003149987885e-07, "loss": 0.0076, "num_tokens": 110157995.0, "reward": 0.6484375119209289, "reward_std": 0.16320026628673076, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6484375119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.34366492331027987, "sampling/importance_sampling_ratio/max": 1.9439698219299317, "sampling/importance_sampling_ratio/mean": 1.0000813722610473, "sampling/importance_sampling_ratio/min": 0.28806779980659486, "sampling/sampling_logp_difference/max": 1.340969157218933, "sampling/sampling_logp_difference/mean": 0.013933260180056094, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1385.6, "completions/max_terminated_length": 1385.6, "completions/mean_length": 937.865625, "completions/mean_terminated_length": 937.865625, "completions/min_length": 572.4, "completions/min_terminated_length": 572.4, "entropy": 0.2845805108547211, "epoch": 0.9518213866039953, "frac_reward_zero_std": 0.25, "grad_norm": 0.8561508655548096, "learning_rate": 4.174945480978919e-07, "loss": -0.0071, "num_tokens": 110759248.0, "reward": 0.6078125178813935, "reward_std": 0.1593657538294792, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6078125178813935, "rewards/e2e_recall_precision_mixed_reward/std": 0.32861425280570983, "sampling/importance_sampling_ratio/max": 1.9477681398391724, "sampling/importance_sampling_ratio/mean": 1.000069797039032, "sampling/importance_sampling_ratio/min": 0.36302418559789656, "sampling/sampling_logp_difference/max": 1.2235562324523925, "sampling/sampling_logp_difference/mean": 0.01336444988846779, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1476.8, "completions/max_terminated_length": 1476.8, "completions/mean_length": 894.56875, "completions/mean_terminated_length": 894.56875, "completions/min_length": 505.6, "completions/min_terminated_length": 505.6, "entropy": 0.2862889677286148, "epoch": 0.9576968272620446, "frac_reward_zero_std": 0.3, "grad_norm": 0.8311812281608582, "learning_rate": 4.168887811969954e-07, "loss": 0.0094, "num_tokens": 111360918.0, "reward": 0.6596875190734863, "reward_std": 0.1476286917924881, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6596875190734863, "rewards/e2e_recall_precision_mixed_reward/std": 0.3123969078063965, "sampling/importance_sampling_ratio/max": 1.830119013786316, "sampling/importance_sampling_ratio/mean": 0.9999667048454285, "sampling/importance_sampling_ratio/min": 0.28043819926679137, "sampling/sampling_logp_difference/max": 1.6224053859710694, "sampling/sampling_logp_difference/mean": 0.013792328163981437, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1540.8, "completions/max_terminated_length": 1540.8, "completions/mean_length": 922.28125, "completions/mean_terminated_length": 922.28125, "completions/min_length": 522.4, "completions/min_terminated_length": 522.4, "entropy": 0.28678136467933657, "epoch": 0.963572267920094, "frac_reward_zero_std": 0.25, "grad_norm": 0.9352256059646606, "learning_rate": 4.1628301429609884e-07, "loss": -0.0143, "num_tokens": 111999568.0, "reward": 0.669218772649765, "reward_std": 0.13827429413795472, "rewards/e2e_recall_precision_mixed_reward/mean": 0.669218772649765, "rewards/e2e_recall_precision_mixed_reward/std": 0.29794834554195404, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000133991241455, "sampling/importance_sampling_ratio/min": 0.31264116019010546, "sampling/sampling_logp_difference/max": 1.346689224243164, "sampling/sampling_logp_difference/mean": 0.014031770080327988, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1431.0, "completions/max_terminated_length": 1431.0, "completions/mean_length": 906.35625, "completions/mean_terminated_length": 906.35625, "completions/min_length": 596.6, "completions/min_terminated_length": 596.6, "entropy": 0.2764395475387573, "epoch": 0.9694477085781433, "frac_reward_zero_std": 0.5, "grad_norm": 0.4546630382537842, "learning_rate": 4.156772473952023e-07, "loss": 0.0027, "num_tokens": 112621858.0, "reward": 0.6304166793823243, "reward_std": 0.11817457228899002, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6304166793823243, "rewards/e2e_recall_precision_mixed_reward/std": 0.29249594509601595, "sampling/importance_sampling_ratio/max": 1.9534301519393922, "sampling/importance_sampling_ratio/mean": 1.000028908252716, "sampling/importance_sampling_ratio/min": 0.37757673263549807, "sampling/sampling_logp_difference/max": 1.0168023586273194, "sampling/sampling_logp_difference/mean": 0.013349436223506927, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1652.2, "completions/max_terminated_length": 1420.2, "completions/mean_length": 914.15625, "completions/mean_terminated_length": 909.2442016601562, "completions/min_length": 581.6, "completions/min_terminated_length": 581.6, "entropy": 0.2829986423254013, "epoch": 0.9753231492361927, "frac_reward_zero_std": 0.35, "grad_norm": 0.637842059135437, "learning_rate": 4.150714804943058e-07, "loss": -0.0096, "num_tokens": 113231744.0, "reward": 0.6623437643051148, "reward_std": 0.13550456166267394, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6623437643051148, "rewards/e2e_recall_precision_mixed_reward/std": 0.31752758026123046, "sampling/importance_sampling_ratio/max": 1.9561806201934815, "sampling/importance_sampling_ratio/mean": 0.9999186754226684, "sampling/importance_sampling_ratio/min": 0.19964413106240678, "sampling/sampling_logp_difference/max": 5.088151931762695, "sampling/sampling_logp_difference/mean": 0.013989451713860035, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1466.2, "completions/max_terminated_length": 1466.2, "completions/mean_length": 941.5, "completions/mean_terminated_length": 941.5, "completions/min_length": 533.6, "completions/min_terminated_length": 533.6, "entropy": 0.28082106113433836, "epoch": 0.981198589894242, "frac_reward_zero_std": 0.05, "grad_norm": 0.9239412546157837, "learning_rate": 4.1446571359340925e-07, "loss": 0.0049, "num_tokens": 113838384.0, "reward": 0.6610416769981384, "reward_std": 0.1860102355480194, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6610416769981384, "rewards/e2e_recall_precision_mixed_reward/std": 0.28068467378616335, "sampling/importance_sampling_ratio/max": 1.9927521705627442, "sampling/importance_sampling_ratio/mean": 1.000089454650879, "sampling/importance_sampling_ratio/min": 0.3198125422000885, "sampling/sampling_logp_difference/max": 1.2942631721496582, "sampling/sampling_logp_difference/mean": 0.013557923585176468, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1497.0, "completions/max_terminated_length": 1497.0, "completions/mean_length": 976.396875, "completions/mean_terminated_length": 976.396875, "completions/min_length": 507.6, "completions/min_terminated_length": 507.6, "entropy": 0.2998843610286713, "epoch": 0.9870740305522914, "frac_reward_zero_std": 0.4, "grad_norm": 0.7953653335571289, "learning_rate": 4.1385994669251274e-07, "loss": -0.0011, "num_tokens": 114463567.0, "reward": 0.6831770896911621, "reward_std": 0.16244979202747345, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6831770896911621, "rewards/e2e_recall_precision_mixed_reward/std": 0.28926865458488465, "sampling/importance_sampling_ratio/max": 1.926166844367981, "sampling/importance_sampling_ratio/mean": 1.0001335382461547, "sampling/importance_sampling_ratio/min": 0.40983167886734007, "sampling/sampling_logp_difference/max": 0.9074871063232421, "sampling/sampling_logp_difference/mean": 0.014117139205336571, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1299.6, "completions/max_terminated_length": 1299.6, "completions/mean_length": 826.9625, "completions/mean_terminated_length": 826.9625, "completions/min_length": 363.6, "completions/min_terminated_length": 363.6, "entropy": 0.2793893039226532, "epoch": 0.9929494712103408, "frac_reward_zero_std": 0.35, "grad_norm": 0.5206387639045715, "learning_rate": 4.132541797916161e-07, "loss": -0.0096, "num_tokens": 115050275.0, "reward": 0.7274479269981384, "reward_std": 0.1434938132762909, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7274479269981384, "rewards/e2e_recall_precision_mixed_reward/std": 0.2630143046379089, "sampling/importance_sampling_ratio/max": 1.9405413150787354, "sampling/importance_sampling_ratio/mean": 1.000065279006958, "sampling/importance_sampling_ratio/min": 0.4045724630355835, "sampling/sampling_logp_difference/max": 0.9253458499908447, "sampling/sampling_logp_difference/mean": 0.013940737955272198, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1406.2, "completions/max_terminated_length": 1406.2, "completions/mean_length": 959.43125, "completions/mean_terminated_length": 959.43125, "completions/min_length": 589.2, "completions/min_terminated_length": 589.2, "entropy": 0.28270782232284547, "epoch": 0.9988249118683902, "frac_reward_zero_std": 0.35, "grad_norm": 0.46818044781684875, "learning_rate": 4.126484128907196e-07, "loss": -0.0087, "num_tokens": 115656285.0, "reward": 0.6919270932674408, "reward_std": 0.12568674832582474, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6919270932674408, "rewards/e2e_recall_precision_mixed_reward/std": 0.3104670524597168, "sampling/importance_sampling_ratio/max": 1.931183409690857, "sampling/importance_sampling_ratio/mean": 1.000103223323822, "sampling/importance_sampling_ratio/min": 0.36354232132434844, "sampling/sampling_logp_difference/max": 1.114108383655548, "sampling/sampling_logp_difference/mean": 0.013803380355238915, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1560.6, "completions/max_terminated_length": 1560.6, "completions/mean_length": 1058.790625, "completions/mean_terminated_length": 1058.790625, "completions/min_length": 636.6, "completions/min_terminated_length": 636.6, "entropy": 0.2910564005374908, "epoch": 1.0047003525264395, "frac_reward_zero_std": 0.3, "grad_norm": 0.8775485754013062, "learning_rate": 4.120426459898231e-07, "loss": 0.0008, "num_tokens": 116329418.0, "reward": 0.6715104341506958, "reward_std": 0.16097910925745965, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6715104341506958, "rewards/e2e_recall_precision_mixed_reward/std": 0.2587837889790535, "sampling/importance_sampling_ratio/max": 1.9636148691177369, "sampling/importance_sampling_ratio/mean": 1.0000767588615418, "sampling/importance_sampling_ratio/min": 0.3296537220478058, "sampling/sampling_logp_difference/max": 1.5510367155075073, "sampling/sampling_logp_difference/mean": 0.013851667195558548, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1445.8, "completions/max_terminated_length": 1445.8, "completions/mean_length": 930.640625, "completions/mean_terminated_length": 930.640625, "completions/min_length": 546.2, "completions/min_terminated_length": 546.2, "entropy": 0.2729405462741852, "epoch": 1.0105757931844888, "frac_reward_zero_std": 0.15, "grad_norm": 0.7856388688087463, "learning_rate": 4.1143687908892654e-07, "loss": -0.0039, "num_tokens": 116934247.0, "reward": 0.795677101612091, "reward_std": 0.19950651228427888, "rewards/e2e_recall_precision_mixed_reward/mean": 0.795677101612091, "rewards/e2e_recall_precision_mixed_reward/std": 0.28640814423561095, "sampling/importance_sampling_ratio/max": 1.9782800912857055, "sampling/importance_sampling_ratio/mean": 1.0000068426132203, "sampling/importance_sampling_ratio/min": 0.2158554643392563, "sampling/sampling_logp_difference/max": 2.2541099786758423, "sampling/sampling_logp_difference/mean": 0.013442268781363963, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 1620.4, "completions/max_terminated_length": 1598.0, "completions/mean_length": 1019.85625, "completions/mean_terminated_length": 1012.270751953125, "completions/min_length": 617.4, "completions/min_terminated_length": 617.4, "entropy": 0.2991854906082153, "epoch": 1.0164512338425382, "frac_reward_zero_std": 0.35, "grad_norm": 0.5161347389221191, "learning_rate": 4.1083111218803003e-07, "loss": -0.0024, "num_tokens": 117587905.0, "reward": 0.5876562565565109, "reward_std": 0.12923510670661925, "rewards/e2e_recall_precision_mixed_reward/mean": 0.5876562565565109, "rewards/e2e_recall_precision_mixed_reward/std": 0.2704192191362381, "sampling/importance_sampling_ratio/max": 1.9975411891937256, "sampling/importance_sampling_ratio/mean": 1.000113844871521, "sampling/importance_sampling_ratio/min": 0.3002059832215309, "sampling/sampling_logp_difference/max": 1.312308168411255, "sampling/sampling_logp_difference/mean": 0.014147781021893024, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1463.4, "completions/max_terminated_length": 1463.4, "completions/mean_length": 978.790625, "completions/mean_terminated_length": 978.790625, "completions/min_length": 589.6, "completions/min_terminated_length": 589.6, "entropy": 0.2838165521621704, "epoch": 1.0223266745005875, "frac_reward_zero_std": 0.2, "grad_norm": 0.6190516948699951, "learning_rate": 4.1022534528713347e-07, "loss": 0.0094, "num_tokens": 118222606.0, "reward": 0.6153125166893005, "reward_std": 0.1697417378425598, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6153125166893005, "rewards/e2e_recall_precision_mixed_reward/std": 0.3247655272483826, "sampling/importance_sampling_ratio/max": 1.9137526273727417, "sampling/importance_sampling_ratio/mean": 0.9999405384063721, "sampling/importance_sampling_ratio/min": 0.33550558388233187, "sampling/sampling_logp_difference/max": 1.268647813796997, "sampling/sampling_logp_difference/mean": 0.013852118141949176, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1434.4, "completions/max_terminated_length": 1434.4, "completions/mean_length": 970.478125, "completions/mean_terminated_length": 970.478125, "completions/min_length": 625.4, "completions/min_terminated_length": 625.4, "entropy": 0.28675009608268737, "epoch": 1.028202115158637, "frac_reward_zero_std": 0.2, "grad_norm": 0.9456232786178589, "learning_rate": 4.0961957838623695e-07, "loss": -0.0062, "num_tokens": 118862071.0, "reward": 0.6469270884990692, "reward_std": 0.160466830432415, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6469270884990692, "rewards/e2e_recall_precision_mixed_reward/std": 0.3246330052614212, "sampling/importance_sampling_ratio/max": 1.9722527265548706, "sampling/importance_sampling_ratio/mean": 0.999930226802826, "sampling/importance_sampling_ratio/min": 0.3685579001903534, "sampling/sampling_logp_difference/max": 1.2024194717407226, "sampling/sampling_logp_difference/mean": 0.013803689368069172, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1432.0, "completions/max_terminated_length": 1432.0, "completions/mean_length": 965.79375, "completions/mean_terminated_length": 965.79375, "completions/min_length": 664.2, "completions/min_terminated_length": 664.2, "entropy": 0.2727407574653625, "epoch": 1.0340775558166861, "frac_reward_zero_std": 0.25, "grad_norm": 0.7059847712516785, "learning_rate": 4.0901381148534044e-07, "loss": 0.0025, "num_tokens": 119470421.0, "reward": 0.7513020873069763, "reward_std": 0.1561640739440918, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7513020873069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.29153428971767426, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999707937240601, "sampling/importance_sampling_ratio/min": 0.3273321449756622, "sampling/sampling_logp_difference/max": 1.3830796003341674, "sampling/sampling_logp_difference/mean": 0.013064392656087876, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1410.0, "completions/max_terminated_length": 1410.0, "completions/mean_length": 874.78125, "completions/mean_terminated_length": 874.78125, "completions/min_length": 512.2, "completions/min_terminated_length": 512.2, "entropy": 0.27989777326583865, "epoch": 1.0399529964747356, "frac_reward_zero_std": 0.55, "grad_norm": 8.109946250915527, "learning_rate": 4.084080445844439e-07, "loss": 0.0028, "num_tokens": 120109087.0, "reward": 0.612500011920929, "reward_std": 0.09955177009105683, "rewards/e2e_recall_precision_mixed_reward/mean": 0.612500011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.3614356517791748, "sampling/importance_sampling_ratio/max": 1.9737717866897584, "sampling/importance_sampling_ratio/mean": 1.0000153660774231, "sampling/importance_sampling_ratio/min": 0.4128256618976593, "sampling/sampling_logp_difference/max": 0.9954976558685302, "sampling/sampling_logp_difference/mean": 0.013704303652048111, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1469.0, "completions/max_terminated_length": 1469.0, "completions/mean_length": 987.765625, "completions/mean_terminated_length": 987.765625, "completions/min_length": 618.6, "completions/min_terminated_length": 618.6, "entropy": 0.28374720811843873, "epoch": 1.045828437132785, "frac_reward_zero_std": 0.4, "grad_norm": 0.6571109890937805, "learning_rate": 4.0780227768354737e-07, "loss": 0.0109, "num_tokens": 120751044.0, "reward": 0.686614590883255, "reward_std": 0.12360157519578933, "rewards/e2e_recall_precision_mixed_reward/mean": 0.686614590883255, "rewards/e2e_recall_precision_mixed_reward/std": 0.2697263121604919, "sampling/importance_sampling_ratio/max": 1.9664803504943849, "sampling/importance_sampling_ratio/mean": 1.0000463843345642, "sampling/importance_sampling_ratio/min": 0.2772391699254513, "sampling/sampling_logp_difference/max": 1.5622613430023193, "sampling/sampling_logp_difference/mean": 0.013510177657008172, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1600.8, "completions/max_terminated_length": 1600.8, "completions/mean_length": 982.44375, "completions/mean_terminated_length": 982.44375, "completions/min_length": 499.2, "completions/min_terminated_length": 499.2, "entropy": 0.2803249657154083, "epoch": 1.0517038777908343, "frac_reward_zero_std": 0.3, "grad_norm": 0.8509756326675415, "learning_rate": 4.071965107826508e-07, "loss": 0.0015, "num_tokens": 121402178.0, "reward": 0.6618229389190674, "reward_std": 0.13814076781272888, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6618229389190674, "rewards/e2e_recall_precision_mixed_reward/std": 0.3093242943286896, "sampling/importance_sampling_ratio/max": 1.9148869276046754, "sampling/importance_sampling_ratio/mean": 0.9999146819114685, "sampling/importance_sampling_ratio/min": 0.3448833405971527, "sampling/sampling_logp_difference/max": 1.3441654443740845, "sampling/sampling_logp_difference/mean": 0.013470856286585332, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1447.6, "completions/max_terminated_length": 1447.6, "completions/mean_length": 971.4125, "completions/mean_terminated_length": 971.4125, "completions/min_length": 522.4, "completions/min_terminated_length": 522.4, "entropy": 0.28140476942062376, "epoch": 1.0575793184488838, "frac_reward_zero_std": 0.45, "grad_norm": 0.7652623057365417, "learning_rate": 4.065907438817543e-07, "loss": 0.0049, "num_tokens": 122006054.0, "reward": 0.7295312762260437, "reward_std": 0.10282711908221245, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7295312762260437, "rewards/e2e_recall_precision_mixed_reward/std": 0.28697769343852997, "sampling/importance_sampling_ratio/max": 1.9570077180862426, "sampling/importance_sampling_ratio/mean": 0.9998312950134277, "sampling/importance_sampling_ratio/min": 0.35704835057258605, "sampling/sampling_logp_difference/max": 1.3200425148010253, "sampling/sampling_logp_difference/mean": 0.013536373898386956, "step": 900 }, { "epoch": 1.0575793184488838, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1455.64, "eval_completions/max_terminated_length": 1455.64, "eval_completions/mean_length": 907.501875, "eval_completions/mean_terminated_length": 907.501875, "eval_completions/min_length": 554.48, "eval_completions/min_terminated_length": 554.48, "eval_entropy": 0.2828002864122391, "eval_frac_reward_zero_std": 0.41, "eval_loss": 0.0018464005552232265, "eval_num_tokens": 122006054.0, "eval_reward": 0.6610416805744171, "eval_reward_std": 0.12711612805724143, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.6610416841506958, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3318196302652359, "eval_runtime": 377.4818, "eval_samples_per_second": 0.265, "eval_sampling/importance_sampling_ratio/max": 1.9399487686157226, "eval_sampling/importance_sampling_ratio/mean": 1.0000247478485107, "eval_sampling/importance_sampling_ratio/min": 0.34321905925869944, "eval_sampling/sampling_logp_difference/max": 1.2090392780303956, "eval_sampling/sampling_logp_difference/mean": 0.013470363169908524, "eval_steps_per_second": 0.005, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1200.4, "completions/max_terminated_length": 1200.4, "completions/mean_length": 885.95625, "completions/mean_terminated_length": 885.95625, "completions/min_length": 559.6, "completions/min_terminated_length": 559.6, "entropy": 0.2904858112335205, "epoch": 1.063454759106933, "frac_reward_zero_std": 0.4, "grad_norm": 0.5744009613990784, "learning_rate": 4.059849769808578e-07, "loss": 0.0104, "num_tokens": 122606664.0, "reward": 0.7563021063804627, "reward_std": 0.12077359333634377, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7563021063804627, "rewards/e2e_recall_precision_mixed_reward/std": 0.25939937233924865, "sampling/importance_sampling_ratio/max": 1.9366278409957887, "sampling/importance_sampling_ratio/mean": 1.0000674724578857, "sampling/importance_sampling_ratio/min": 0.3904744863510132, "sampling/sampling_logp_difference/max": 1.0748745203018188, "sampling/sampling_logp_difference/mean": 0.013862324692308903, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1405.6, "completions/max_terminated_length": 1405.6, "completions/mean_length": 947.584375, "completions/mean_terminated_length": 947.584375, "completions/min_length": 553.2, "completions/min_terminated_length": 553.2, "entropy": 0.28678281903266906, "epoch": 1.0693301997649824, "frac_reward_zero_std": 0.25, "grad_norm": 0.8974121809005737, "learning_rate": 4.053792100799612e-07, "loss": 0.0017, "num_tokens": 123210147.0, "reward": 0.6382291734218597, "reward_std": 0.15600190162658692, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6382291734218597, "rewards/e2e_recall_precision_mixed_reward/std": 0.3487572968006134, "sampling/importance_sampling_ratio/max": 1.9583675384521484, "sampling/importance_sampling_ratio/mean": 0.9999847769737243, "sampling/importance_sampling_ratio/min": 0.3770772695541382, "sampling/sampling_logp_difference/max": 1.145704698562622, "sampling/sampling_logp_difference/mean": 0.013637512736022473, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1463.6, "completions/max_terminated_length": 1463.6, "completions/mean_length": 1009.46875, "completions/mean_terminated_length": 1009.46875, "completions/min_length": 643.2, "completions/min_terminated_length": 643.2, "entropy": 0.2971984803676605, "epoch": 1.0752056404230317, "frac_reward_zero_std": 0.25, "grad_norm": 0.7462270259857178, "learning_rate": 4.047734431790647e-07, "loss": -0.0047, "num_tokens": 123839481.0, "reward": 0.7052083373069763, "reward_std": 0.16606248915195465, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7052083373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.27336723506450655, "sampling/importance_sampling_ratio/max": 1.9635993242263794, "sampling/importance_sampling_ratio/mean": 1.0000979065895081, "sampling/importance_sampling_ratio/min": 0.313095235824585, "sampling/sampling_logp_difference/max": 1.3117567539215087, "sampling/sampling_logp_difference/mean": 0.013715284876525402, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1282.0, "completions/max_terminated_length": 1282.0, "completions/mean_length": 916.96875, "completions/mean_terminated_length": 916.96875, "completions/min_length": 545.4, "completions/min_terminated_length": 545.4, "entropy": 0.26898905336856843, "epoch": 1.0810810810810811, "frac_reward_zero_std": 0.5, "grad_norm": 0.0, "learning_rate": 4.0416767627816815e-07, "loss": -0.0374, "num_tokens": 124465855.0, "reward": 0.7244791984558105, "reward_std": 0.0911778524518013, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7244791984558105, "rewards/e2e_recall_precision_mixed_reward/std": 0.2757032006978989, "sampling/importance_sampling_ratio/max": 1.7921949625015259, "sampling/importance_sampling_ratio/mean": 0.9998450398445129, "sampling/importance_sampling_ratio/min": 0.2510555416345596, "sampling/sampling_logp_difference/max": 1.4895573616027833, "sampling/sampling_logp_difference/mean": 0.013172058574855327, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1380.6, "completions/max_terminated_length": 1380.6, "completions/mean_length": 949.85625, "completions/mean_terminated_length": 949.85625, "completions/min_length": 602.4, "completions/min_terminated_length": 602.4, "entropy": 0.27048816680908205, "epoch": 1.0869565217391304, "frac_reward_zero_std": 0.35, "grad_norm": 0.9172391891479492, "learning_rate": 4.035619093772716e-07, "loss": 0.0013, "num_tokens": 125084081.0, "reward": 0.6635416746139526, "reward_std": 0.13708136826753617, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6635416746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.3219247102737427, "sampling/importance_sampling_ratio/max": 1.861747407913208, "sampling/importance_sampling_ratio/mean": 0.9998850226402283, "sampling/importance_sampling_ratio/min": 0.297413569688797, "sampling/sampling_logp_difference/max": 1.513976526260376, "sampling/sampling_logp_difference/mean": 0.012974279746413232, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1404.2, "completions/max_terminated_length": 1404.2, "completions/mean_length": 947.790625, "completions/mean_terminated_length": 947.790625, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "entropy": 0.2738521546125412, "epoch": 1.0928319623971798, "frac_reward_zero_std": 0.35, "grad_norm": 0.8329576849937439, "learning_rate": 4.0295614247637507e-07, "loss": -0.0001, "num_tokens": 125721630.0, "reward": 0.7278645873069763, "reward_std": 0.13716669231653214, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7278645873069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.2882854163646698, "sampling/importance_sampling_ratio/max": 1.9897720098495484, "sampling/importance_sampling_ratio/mean": 1.0000043034553527, "sampling/importance_sampling_ratio/min": 0.3875389933586121, "sampling/sampling_logp_difference/max": 1.3065906524658204, "sampling/sampling_logp_difference/mean": 0.013173772767186166, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1520.4, "completions/max_terminated_length": 1520.4, "completions/mean_length": 953.040625, "completions/mean_terminated_length": 953.040625, "completions/min_length": 603.6, "completions/min_terminated_length": 603.6, "entropy": 0.2765673935413361, "epoch": 1.098707403055229, "frac_reward_zero_std": 0.1, "grad_norm": 0.8879562616348267, "learning_rate": 4.023503755754785e-07, "loss": -0.0082, "num_tokens": 126360139.0, "reward": 0.6124479234218597, "reward_std": 0.17750487923622132, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6124479234218597, "rewards/e2e_recall_precision_mixed_reward/std": 0.2882064312696457, "sampling/importance_sampling_ratio/max": 1.9080177783966064, "sampling/importance_sampling_ratio/mean": 0.9999000906944275, "sampling/importance_sampling_ratio/min": 0.3805552273988724, "sampling/sampling_logp_difference/max": 1.0973583936691285, "sampling/sampling_logp_difference/mean": 0.013284470327198506, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1582.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 971.50625, "completions/mean_terminated_length": 971.50625, "completions/min_length": 586.8, "completions/min_terminated_length": 586.8, "entropy": 0.3045738399028778, "epoch": 1.1045828437132785, "frac_reward_zero_std": 0.2, "grad_norm": 0.7119807600975037, "learning_rate": 4.01744608674582e-07, "loss": -0.0073, "num_tokens": 127001389.0, "reward": 0.7123437762260437, "reward_std": 0.17186959385871886, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7123437762260437, "rewards/e2e_recall_precision_mixed_reward/std": 0.3540113389492035, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000130653381347, "sampling/importance_sampling_ratio/min": 0.3947632074356079, "sampling/sampling_logp_difference/max": 1.1084512948989869, "sampling/sampling_logp_difference/mean": 0.014095421135425567, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1514.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 969.684375, "completions/mean_terminated_length": 969.684375, "completions/min_length": 561.0, "completions/min_terminated_length": 561.0, "entropy": 0.30098283290863037, "epoch": 1.1104582843713278, "frac_reward_zero_std": 0.4, "grad_norm": 0.8372572064399719, "learning_rate": 4.0113884177368543e-07, "loss": -0.004, "num_tokens": 127621528.0, "reward": 0.6710937559604645, "reward_std": 0.1159849688410759, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6710937559604645, "rewards/e2e_recall_precision_mixed_reward/std": 0.3547815322875977, "sampling/importance_sampling_ratio/max": 1.9597235918045044, "sampling/importance_sampling_ratio/mean": 0.9999991655349731, "sampling/importance_sampling_ratio/min": 0.4007263362407684, "sampling/sampling_logp_difference/max": 1.044080376625061, "sampling/sampling_logp_difference/mean": 0.014144887030124665, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1583.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 988.59375, "completions/mean_terminated_length": 988.59375, "completions/min_length": 660.8, "completions/min_terminated_length": 660.8, "entropy": 0.2859313428401947, "epoch": 1.1163337250293772, "frac_reward_zero_std": 0.3, "grad_norm": 0.7519243955612183, "learning_rate": 4.005330748727889e-07, "loss": 0.0136, "num_tokens": 128225958.0, "reward": 0.756458330154419, "reward_std": 0.13837233185768127, "rewards/e2e_recall_precision_mixed_reward/mean": 0.756458330154419, "rewards/e2e_recall_precision_mixed_reward/std": 0.24186617136001587, "sampling/importance_sampling_ratio/max": 1.8979056119918822, "sampling/importance_sampling_ratio/mean": 1.0001055955886842, "sampling/importance_sampling_ratio/min": 0.31551241781562567, "sampling/sampling_logp_difference/max": 2.015387845039368, "sampling/sampling_logp_difference/mean": 0.013428857550024986, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1616.8, "completions/max_terminated_length": 1616.8, "completions/mean_length": 1025.23125, "completions/mean_terminated_length": 1025.23125, "completions/min_length": 682.4, "completions/min_terminated_length": 682.4, "entropy": 0.30995495319366456, "epoch": 1.1222091656874265, "frac_reward_zero_std": 0.1, "grad_norm": 0.8149730563163757, "learning_rate": 3.999273079718924e-07, "loss": 0.0003, "num_tokens": 128900864.0, "reward": 0.6762500166893005, "reward_std": 0.1645580381155014, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6762500166893005, "rewards/e2e_recall_precision_mixed_reward/std": 0.25772719383239745, "sampling/importance_sampling_ratio/max": 1.9383854150772095, "sampling/importance_sampling_ratio/mean": 1.0000874400138855, "sampling/importance_sampling_ratio/min": 0.3173495039343834, "sampling/sampling_logp_difference/max": 1.5993350982666015, "sampling/sampling_logp_difference/mean": 0.013984563015401363, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1401.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 988.059375, "completions/mean_terminated_length": 988.059375, "completions/min_length": 628.2, "completions/min_terminated_length": 628.2, "entropy": 0.3168246328830719, "epoch": 1.128084606345476, "frac_reward_zero_std": 0.3, "grad_norm": 0.6208041906356812, "learning_rate": 3.9932154107099585e-07, "loss": 0.0121, "num_tokens": 129572115.0, "reward": 0.6992708444595337, "reward_std": 0.14730916619300843, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6992708444595337, "rewards/e2e_recall_precision_mixed_reward/std": 0.27717364132404326, "sampling/importance_sampling_ratio/max": 1.9295872926712037, "sampling/importance_sampling_ratio/mean": 1.0002163410186768, "sampling/importance_sampling_ratio/min": 0.39420167207717893, "sampling/sampling_logp_difference/max": 1.2145282983779908, "sampling/sampling_logp_difference/mean": 0.014649266377091408, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1293.8, "completions/max_terminated_length": 1293.8, "completions/mean_length": 939.94375, "completions/mean_terminated_length": 939.94375, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "entropy": 0.28997875452041627, "epoch": 1.1339600470035252, "frac_reward_zero_std": 0.4, "grad_norm": 0.8977941274642944, "learning_rate": 3.9871577417009934e-07, "loss": -0.0042, "num_tokens": 130222945.0, "reward": 0.7091145992279053, "reward_std": 0.14130303710699083, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7091145992279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.31121143102645876, "sampling/importance_sampling_ratio/max": 1.9547757387161255, "sampling/importance_sampling_ratio/mean": 0.9999549508094787, "sampling/importance_sampling_ratio/min": 0.3619807779788971, "sampling/sampling_logp_difference/max": 1.2246970176696776, "sampling/sampling_logp_difference/mean": 0.013661802746355534, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1465.6, "completions/max_terminated_length": 1465.6, "completions/mean_length": 1035.078125, "completions/mean_terminated_length": 1035.078125, "completions/min_length": 594.8, "completions/min_terminated_length": 594.8, "entropy": 0.2883412778377533, "epoch": 1.1398354876615746, "frac_reward_zero_std": 0.3, "grad_norm": 0.8469650149345398, "learning_rate": 3.981100072692028e-07, "loss": -0.0027, "num_tokens": 130881018.0, "reward": 0.6606770992279053, "reward_std": 0.1474784180521965, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6606770992279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.36500520408153536, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000018060207367, "sampling/importance_sampling_ratio/min": 0.3527994304895401, "sampling/sampling_logp_difference/max": 1.2171527862548828, "sampling/sampling_logp_difference/mean": 0.013521765917539596, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1390.8, "completions/max_terminated_length": 1390.8, "completions/mean_length": 1007.9625, "completions/mean_terminated_length": 1007.9625, "completions/min_length": 686.8, "completions/min_terminated_length": 686.8, "entropy": 0.284423416852951, "epoch": 1.145710928319624, "frac_reward_zero_std": 0.45, "grad_norm": 0.6974943280220032, "learning_rate": 3.9750424036830626e-07, "loss": -0.0051, "num_tokens": 131522462.0, "reward": 0.568177092075348, "reward_std": 0.10882167518138885, "rewards/e2e_recall_precision_mixed_reward/mean": 0.568177092075348, "rewards/e2e_recall_precision_mixed_reward/std": 0.3816623717546463, "sampling/importance_sampling_ratio/max": 1.7701248168945312, "sampling/importance_sampling_ratio/mean": 0.9999536991119384, "sampling/importance_sampling_ratio/min": 0.45498186349868774, "sampling/sampling_logp_difference/max": 0.8206616401672363, "sampling/sampling_logp_difference/mean": 0.01323307417333126, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1590.4, "completions/max_terminated_length": 1590.4, "completions/mean_length": 1085.0625, "completions/mean_terminated_length": 1085.0625, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "entropy": 0.3135793745517731, "epoch": 1.1515863689776733, "frac_reward_zero_std": 0.25, "grad_norm": 1.0624127388000488, "learning_rate": 3.9689847346740975e-07, "loss": -0.0149, "num_tokens": 132210226.0, "reward": 0.6392708539962768, "reward_std": 0.14114319533109665, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6392708539962768, "rewards/e2e_recall_precision_mixed_reward/std": 0.35320115089416504, "sampling/importance_sampling_ratio/max": 1.7919421195983887, "sampling/importance_sampling_ratio/mean": 0.9998708367347717, "sampling/importance_sampling_ratio/min": 0.30457684099674226, "sampling/sampling_logp_difference/max": 1.2323057889938354, "sampling/sampling_logp_difference/mean": 0.014576360583305359, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1604.6, "completions/max_terminated_length": 1604.6, "completions/mean_length": 1116.96875, "completions/mean_terminated_length": 1116.96875, "completions/min_length": 598.8, "completions/min_terminated_length": 598.8, "entropy": 0.32797098755836485, "epoch": 1.1574618096357228, "frac_reward_zero_std": 0.15, "grad_norm": 0.6008588671684265, "learning_rate": 3.962927065665132e-07, "loss": 0.0174, "num_tokens": 132912392.0, "reward": 0.6921354413032532, "reward_std": 0.18891476839780807, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6921354413032532, "rewards/e2e_recall_precision_mixed_reward/std": 0.2808209300041199, "sampling/importance_sampling_ratio/max": 1.99753577709198, "sampling/importance_sampling_ratio/mean": 0.9998470783233643, "sampling/importance_sampling_ratio/min": 0.323549946770072, "sampling/sampling_logp_difference/max": 1.5377471923828125, "sampling/sampling_logp_difference/mean": 0.015015862323343754, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1794.0, "completions/max_terminated_length": 1599.8, "completions/mean_length": 1081.353125, "completions/mean_terminated_length": 1076.6255126953124, "completions/min_length": 662.0, "completions/min_terminated_length": 662.0, "entropy": 0.31267160177230835, "epoch": 1.163337250293772, "frac_reward_zero_std": 0.35, "grad_norm": 0.7435998916625977, "learning_rate": 3.956869396656167e-07, "loss": -0.0173, "num_tokens": 133587877.0, "reward": 0.528125011920929, "reward_std": 0.13365225195884706, "rewards/e2e_recall_precision_mixed_reward/mean": 0.528125011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.35800984501838684, "sampling/importance_sampling_ratio/max": 1.9870172500610352, "sampling/importance_sampling_ratio/mean": 0.9999597549438477, "sampling/importance_sampling_ratio/min": 0.4234741389751434, "sampling/sampling_logp_difference/max": 0.9320145964622497, "sampling/sampling_logp_difference/mean": 0.01449219174683094, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1483.0, "completions/max_terminated_length": 1483.0, "completions/mean_length": 1044.7875, "completions/mean_terminated_length": 1044.7875, "completions/min_length": 646.2, "completions/min_terminated_length": 646.2, "entropy": 0.31615627408027647, "epoch": 1.1692126909518215, "frac_reward_zero_std": 0.55, "grad_norm": 0.6654034852981567, "learning_rate": 3.950811727647201e-07, "loss": 0.0099, "num_tokens": 134243985.0, "reward": 0.8039062738418579, "reward_std": 0.09408158212900161, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8039062738418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.2852950572967529, "sampling/importance_sampling_ratio/max": 1.8471860408782959, "sampling/importance_sampling_ratio/mean": 0.9998784899711609, "sampling/importance_sampling_ratio/min": 0.3083785384893417, "sampling/sampling_logp_difference/max": 1.310741949081421, "sampling/sampling_logp_difference/mean": 0.014296729303896426, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1674.8, "completions/max_terminated_length": 1674.8, "completions/mean_length": 1010.9375, "completions/mean_terminated_length": 1010.9375, "completions/min_length": 551.2, "completions/min_terminated_length": 551.2, "entropy": 0.30919942259788513, "epoch": 1.1750881316098707, "frac_reward_zero_std": 0.45, "grad_norm": 0.797741174697876, "learning_rate": 3.944754058638236e-07, "loss": -0.0179, "num_tokens": 134896445.0, "reward": 0.6697916746139526, "reward_std": 0.14145222902297974, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6697916746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.3327239155769348, "sampling/importance_sampling_ratio/max": 1.9311280488967895, "sampling/importance_sampling_ratio/mean": 0.9998689293861389, "sampling/importance_sampling_ratio/min": 0.2995154604315758, "sampling/sampling_logp_difference/max": 1.3440932035446167, "sampling/sampling_logp_difference/mean": 0.014279096573591232, "step": 1000 }, { "epoch": 1.1750881316098707, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.000625, "eval_completions/max_length": 1595.28, "eval_completions/max_terminated_length": 1578.0, "eval_completions/mean_length": 1022.27875, "eval_completions/mean_terminated_length": 1021.36798828125, "eval_completions/min_length": 659.68, "eval_completions/min_terminated_length": 659.68, "eval_entropy": 0.3078763961791992, "eval_frac_reward_zero_std": 0.39, "eval_loss": 0.0021373536437749863, "eval_num_tokens": 134896445.0, "eval_reward": 0.6751354324817658, "eval_reward_std": 0.12967597171664239, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.6751354372501374, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3238289541006088, "eval_runtime": 414.1064, "eval_samples_per_second": 0.241, "eval_sampling/importance_sampling_ratio/max": 1.892482771873474, "eval_sampling/importance_sampling_ratio/mean": 0.9999824357032776, "eval_sampling/importance_sampling_ratio/min": 0.366875017285347, "eval_sampling/sampling_logp_difference/max": 1.0593089628219605, "eval_sampling/sampling_logp_difference/mean": 0.01425721075385809, "eval_steps_per_second": 0.005, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1866.6, "completions/max_terminated_length": 1866.6, "completions/mean_length": 1096.41875, "completions/mean_terminated_length": 1096.41875, "completions/min_length": 782.2, "completions/min_terminated_length": 782.2, "entropy": 0.31382623314857483, "epoch": 1.1809635722679201, "frac_reward_zero_std": 0.2, "grad_norm": 0.8752436637878418, "learning_rate": 3.938696389629271e-07, "loss": 0.0053, "num_tokens": 135588419.0, "reward": 0.7271875023841858, "reward_std": 0.14706478863954545, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7271875023841858, "rewards/e2e_recall_precision_mixed_reward/std": 0.23389651179313659, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000099420547486, "sampling/importance_sampling_ratio/min": 0.2783097416162491, "sampling/sampling_logp_difference/max": 1.292723035812378, "sampling/sampling_logp_difference/mean": 0.014573409222066402, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1822.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 1118.215625, "completions/mean_terminated_length": 1118.215625, "completions/min_length": 695.8, "completions/min_terminated_length": 695.8, "entropy": 0.3018251657485962, "epoch": 1.1868390129259694, "frac_reward_zero_std": 0.3, "grad_norm": 0.8171986937522888, "learning_rate": 3.932638720620305e-07, "loss": -0.0098, "num_tokens": 136260632.0, "reward": 0.7615625262260437, "reward_std": 0.12762271910905837, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7615625262260437, "rewards/e2e_recall_precision_mixed_reward/std": 0.2775488555431366, "sampling/importance_sampling_ratio/max": 1.9084330320358276, "sampling/importance_sampling_ratio/mean": 0.9998735070228577, "sampling/importance_sampling_ratio/min": 0.37997121512889864, "sampling/sampling_logp_difference/max": 1.0514094591140748, "sampling/sampling_logp_difference/mean": 0.013783762976527215, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1724.4, "completions/max_terminated_length": 1610.0, "completions/mean_length": 1068.925, "completions/mean_terminated_length": 1064.7508544921875, "completions/min_length": 646.6, "completions/min_terminated_length": 646.6, "entropy": 0.31577218770980836, "epoch": 1.1927144535840188, "frac_reward_zero_std": 0.5, "grad_norm": 0.9301170706748962, "learning_rate": 3.9265810516113397e-07, "loss": -0.0118, "num_tokens": 136912540.0, "reward": 0.66307293176651, "reward_std": 0.1278460681438446, "rewards/e2e_recall_precision_mixed_reward/mean": 0.66307293176651, "rewards/e2e_recall_precision_mixed_reward/std": 0.3507560431957245, "sampling/importance_sampling_ratio/max": 1.98316330909729, "sampling/importance_sampling_ratio/mean": 0.999949038028717, "sampling/importance_sampling_ratio/min": 0.42532923221588137, "sampling/sampling_logp_difference/max": 0.8826199054718018, "sampling/sampling_logp_difference/mean": 0.014387455582618714, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1700.6, "completions/max_terminated_length": 1700.6, "completions/mean_length": 1082.975, "completions/mean_terminated_length": 1082.975, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "entropy": 0.3057791173458099, "epoch": 1.198589894242068, "frac_reward_zero_std": 0.2, "grad_norm": 0.6882150173187256, "learning_rate": 3.920523382602374e-07, "loss": 0.0127, "num_tokens": 137576788.0, "reward": 0.6527604460716248, "reward_std": 0.15843599140644074, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6527604460716248, "rewards/e2e_recall_precision_mixed_reward/std": 0.35246109366416933, "sampling/importance_sampling_ratio/max": 1.9179187059402465, "sampling/importance_sampling_ratio/mean": 1.0000202059745789, "sampling/importance_sampling_ratio/min": 0.3111193537712097, "sampling/sampling_logp_difference/max": 1.2015019178390502, "sampling/sampling_logp_difference/mean": 0.014157050289213657, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1428.8, "completions/max_terminated_length": 1428.8, "completions/mean_length": 1032.625, "completions/mean_terminated_length": 1032.625, "completions/min_length": 639.8, "completions/min_terminated_length": 639.8, "entropy": 0.3173108518123627, "epoch": 1.2044653349001175, "frac_reward_zero_std": 0.4, "grad_norm": 0.774826169013977, "learning_rate": 3.914465713593409e-07, "loss": 0.0059, "num_tokens": 138243644.0, "reward": 0.7501562595367431, "reward_std": 0.12034987509250641, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7501562595367431, "rewards/e2e_recall_precision_mixed_reward/std": 0.27434692680835726, "sampling/importance_sampling_ratio/max": 1.977065873146057, "sampling/importance_sampling_ratio/mean": 1.0000038266181945, "sampling/importance_sampling_ratio/min": 0.40885123908519744, "sampling/sampling_logp_difference/max": 1.1413819074630738, "sampling/sampling_logp_difference/mean": 0.014192923717200757, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1571.8, "completions/max_terminated_length": 1571.8, "completions/mean_length": 1090.91875, "completions/mean_terminated_length": 1090.91875, "completions/min_length": 744.8, "completions/min_terminated_length": 744.8, "entropy": 0.32138744592666624, "epoch": 1.2103407755581668, "frac_reward_zero_std": 0.65, "grad_norm": 0.4858890771865845, "learning_rate": 3.908408044584444e-07, "loss": 0.0088, "num_tokens": 138915474.0, "reward": 0.7064583480358124, "reward_std": 0.06657353341579438, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7064583480358124, "rewards/e2e_recall_precision_mixed_reward/std": 0.30555517971515656, "sampling/importance_sampling_ratio/max": 1.9622070789337158, "sampling/importance_sampling_ratio/mean": 1.0000203251838684, "sampling/importance_sampling_ratio/min": 0.3662696361541748, "sampling/sampling_logp_difference/max": 1.2498675346374513, "sampling/sampling_logp_difference/mean": 0.014360511116683483, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1514.2, "completions/max_terminated_length": 1514.2, "completions/mean_length": 1031.846875, "completions/mean_terminated_length": 1031.846875, "completions/min_length": 668.6, "completions/min_terminated_length": 668.6, "entropy": 0.30652998089790345, "epoch": 1.2162162162162162, "frac_reward_zero_std": 0.5, "grad_norm": 0.7699494957923889, "learning_rate": 3.902350375575478e-07, "loss": 0.009, "num_tokens": 139571185.0, "reward": 0.7100000023841858, "reward_std": 0.0897395059466362, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7100000023841858, "rewards/e2e_recall_precision_mixed_reward/std": 0.3180919706821442, "sampling/importance_sampling_ratio/max": 1.8632827520370483, "sampling/importance_sampling_ratio/mean": 0.9999406576156616, "sampling/importance_sampling_ratio/min": 0.3360584322363138, "sampling/sampling_logp_difference/max": 1.4799968481063843, "sampling/sampling_logp_difference/mean": 0.014310248382389545, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1635.2, "completions/max_terminated_length": 1635.2, "completions/mean_length": 1085.465625, "completions/mean_terminated_length": 1085.465625, "completions/min_length": 661.8, "completions/min_terminated_length": 661.8, "entropy": 0.31891674995422364, "epoch": 1.2220916568742655, "frac_reward_zero_std": 0.2, "grad_norm": 0.8770401477813721, "learning_rate": 3.896292706566513e-07, "loss": -0.0065, "num_tokens": 140219510.0, "reward": 0.646302092075348, "reward_std": 0.18240397274494172, "rewards/e2e_recall_precision_mixed_reward/mean": 0.646302092075348, "rewards/e2e_recall_precision_mixed_reward/std": 0.3216747730970383, "sampling/importance_sampling_ratio/max": 1.9571067094802856, "sampling/importance_sampling_ratio/mean": 1.0000643253326416, "sampling/importance_sampling_ratio/min": 0.35792707204818724, "sampling/sampling_logp_difference/max": 1.0642118215560914, "sampling/sampling_logp_difference/mean": 0.014467264525592327, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1492.8, "completions/max_terminated_length": 1492.8, "completions/mean_length": 1060.609375, "completions/mean_terminated_length": 1060.609375, "completions/min_length": 768.6, "completions/min_terminated_length": 768.6, "entropy": 0.2915258467197418, "epoch": 1.227967097532315, "frac_reward_zero_std": 0.35, "grad_norm": 0.8293555378913879, "learning_rate": 3.8902350375575474e-07, "loss": 0.0039, "num_tokens": 140889625.0, "reward": 0.7548958539962769, "reward_std": 0.12820187769830227, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7548958539962769, "rewards/e2e_recall_precision_mixed_reward/std": 0.26064820885658263, "sampling/importance_sampling_ratio/max": 1.998341941833496, "sampling/importance_sampling_ratio/mean": 0.9999308705329895, "sampling/importance_sampling_ratio/min": 0.31071800738573074, "sampling/sampling_logp_difference/max": 1.4954757213592529, "sampling/sampling_logp_difference/mean": 0.013605404086411, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1621.8, "completions/max_terminated_length": 1621.8, "completions/mean_length": 971.846875, "completions/mean_terminated_length": 971.846875, "completions/min_length": 557.6, "completions/min_terminated_length": 557.6, "entropy": 0.29920910596847533, "epoch": 1.2338425381903644, "frac_reward_zero_std": 0.55, "grad_norm": 0.6538206338882446, "learning_rate": 3.8841773685485823e-07, "loss": 0.0112, "num_tokens": 141503960.0, "reward": 0.6366666674613952, "reward_std": 0.09315153658390045, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6366666793823242, "rewards/e2e_recall_precision_mixed_reward/std": 0.2838348388671875, "sampling/importance_sampling_ratio/max": 1.9519197940826416, "sampling/importance_sampling_ratio/mean": 1.000203275680542, "sampling/importance_sampling_ratio/min": 0.3838867276906967, "sampling/sampling_logp_difference/max": 1.0728678226470947, "sampling/sampling_logp_difference/mean": 0.014343824982643128, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1641.4, "completions/max_terminated_length": 1641.4, "completions/mean_length": 1104.896875, "completions/mean_terminated_length": 1104.896875, "completions/min_length": 646.0, "completions/min_terminated_length": 646.0, "entropy": 0.2859143793582916, "epoch": 1.2397179788484136, "frac_reward_zero_std": 0.5, "grad_norm": 0.4819332957267761, "learning_rate": 3.878119699539617e-07, "loss": 0.0001, "num_tokens": 142201559.0, "reward": 0.6817708432674408, "reward_std": 0.08722722977399826, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6817708432674408, "rewards/e2e_recall_precision_mixed_reward/std": 0.3009837418794632, "sampling/importance_sampling_ratio/max": 1.929477906227112, "sampling/importance_sampling_ratio/mean": 0.9999855399131775, "sampling/importance_sampling_ratio/min": 0.3711371779441833, "sampling/sampling_logp_difference/max": 1.088106060028076, "sampling/sampling_logp_difference/mean": 0.01334429495036602, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1336.4, "completions/max_terminated_length": 1336.4, "completions/mean_length": 954.615625, "completions/mean_terminated_length": 954.615625, "completions/min_length": 709.0, "completions/min_terminated_length": 709.0, "entropy": 0.2719797283411026, "epoch": 1.245593419506463, "frac_reward_zero_std": 0.3, "grad_norm": 0.7902669310569763, "learning_rate": 3.8720620305306516e-07, "loss": 0.0013, "num_tokens": 142837324.0, "reward": 0.7114583492279053, "reward_std": 0.12315647304058075, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7114583492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.28056967854499815, "sampling/importance_sampling_ratio/max": 1.9814757108688354, "sampling/importance_sampling_ratio/mean": 1.000010859966278, "sampling/importance_sampling_ratio/min": 0.33336481153965, "sampling/sampling_logp_difference/max": 1.292347240447998, "sampling/sampling_logp_difference/mean": 0.012967149168252945, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1552.4, "completions/max_terminated_length": 1552.4, "completions/mean_length": 1021.371875, "completions/mean_terminated_length": 1021.371875, "completions/min_length": 677.2, "completions/min_terminated_length": 677.2, "entropy": 0.2754159212112427, "epoch": 1.2514688601645123, "frac_reward_zero_std": 0.6, "grad_norm": 0.4861776530742645, "learning_rate": 3.8660043615216865e-07, "loss": -0.0063, "num_tokens": 143497939.0, "reward": 0.7752604246139526, "reward_std": 0.10692294090986251, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7752604246139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.2773577839136124, "sampling/importance_sampling_ratio/max": 1.9510645627975465, "sampling/importance_sampling_ratio/mean": 1.00001460313797, "sampling/importance_sampling_ratio/min": 0.3078484356403351, "sampling/sampling_logp_difference/max": 1.2806957244873047, "sampling/sampling_logp_difference/mean": 0.013292433321475982, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1440.6, "completions/max_terminated_length": 1440.6, "completions/mean_length": 998.9875, "completions/mean_terminated_length": 998.9875, "completions/min_length": 659.6, "completions/min_terminated_length": 659.6, "entropy": 0.2834341287612915, "epoch": 1.2573443008225618, "frac_reward_zero_std": 0.45, "grad_norm": 0.6322982907295227, "learning_rate": 3.859946692512721e-07, "loss": 0.0005, "num_tokens": 144123663.0, "reward": 0.8090625166893005, "reward_std": 0.11898693442344666, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8090625166893005, "rewards/e2e_recall_precision_mixed_reward/std": 0.23603131622076035, "sampling/importance_sampling_ratio/max": 1.8387038946151733, "sampling/importance_sampling_ratio/mean": 1.0000056982040406, "sampling/importance_sampling_ratio/min": 0.3637888193130493, "sampling/sampling_logp_difference/max": 1.0442706823349, "sampling/sampling_logp_difference/mean": 0.013553647883236408, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1840.4, "completions/max_terminated_length": 1650.4, "completions/mean_length": 1059.684375, "completions/mean_terminated_length": 1055.2914306640625, "completions/min_length": 690.4, "completions/min_terminated_length": 690.4, "entropy": 0.29739258885383607, "epoch": 1.263219741480611, "frac_reward_zero_std": 0.35, "grad_norm": 0.9271990656852722, "learning_rate": 3.853889023503756e-07, "loss": -0.0163, "num_tokens": 144791014.0, "reward": 0.6409895896911622, "reward_std": 0.13399946838617324, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6409896016120911, "rewards/e2e_recall_precision_mixed_reward/std": 0.3103723287582397, "sampling/importance_sampling_ratio/max": 1.9143768310546876, "sampling/importance_sampling_ratio/mean": 1.000032413005829, "sampling/importance_sampling_ratio/min": 0.37732569575309755, "sampling/sampling_logp_difference/max": 1.162096655368805, "sampling/sampling_logp_difference/mean": 0.013873641937971115, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1678.8, "completions/max_terminated_length": 1678.8, "completions/mean_length": 1075.20625, "completions/mean_terminated_length": 1075.20625, "completions/min_length": 751.6, "completions/min_terminated_length": 751.6, "entropy": 0.28603232502937315, "epoch": 1.2690951821386605, "frac_reward_zero_std": 0.45, "grad_norm": 0.3865974247455597, "learning_rate": 3.8478313544947906e-07, "loss": -0.0016, "num_tokens": 145442248.0, "reward": 0.716979193687439, "reward_std": 0.1147857926785946, "rewards/e2e_recall_precision_mixed_reward/mean": 0.716979193687439, "rewards/e2e_recall_precision_mixed_reward/std": 0.35159227848052976, "sampling/importance_sampling_ratio/max": 1.8565044403076172, "sampling/importance_sampling_ratio/mean": 0.9999043822288514, "sampling/importance_sampling_ratio/min": 0.2803425773978233, "sampling/sampling_logp_difference/max": 1.4406961679458619, "sampling/sampling_logp_difference/mean": 0.013564172014594079, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1652.2, "completions/max_terminated_length": 1652.2, "completions/mean_length": 1100.740625, "completions/mean_terminated_length": 1100.740625, "completions/min_length": 779.6, "completions/min_terminated_length": 779.6, "entropy": 0.2707725286483765, "epoch": 1.2749706227967097, "frac_reward_zero_std": 0.55, "grad_norm": 0.7248564958572388, "learning_rate": 3.841773685485825e-07, "loss": 0.0048, "num_tokens": 146117157.0, "reward": 0.7029687643051148, "reward_std": 0.10289829894900322, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7029687643051148, "rewards/e2e_recall_precision_mixed_reward/std": 0.30764630138874055, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999703764915466, "sampling/importance_sampling_ratio/min": 0.19010883904993534, "sampling/sampling_logp_difference/max": 1.9769764423370362, "sampling/sampling_logp_difference/mean": 0.013046731427311897, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1497.0, "completions/max_terminated_length": 1497.0, "completions/mean_length": 1038.95, "completions/mean_terminated_length": 1038.95, "completions/min_length": 711.4, "completions/min_terminated_length": 711.4, "entropy": 0.2841998040676117, "epoch": 1.2808460634547592, "frac_reward_zero_std": 0.5, "grad_norm": 0.4237470030784607, "learning_rate": 3.8357160164768594e-07, "loss": 0.0057, "num_tokens": 146740245.0, "reward": 0.784166669845581, "reward_std": 0.10403337031602859, "rewards/e2e_recall_precision_mixed_reward/mean": 0.784166669845581, "rewards/e2e_recall_precision_mixed_reward/std": 0.27793429493904115, "sampling/importance_sampling_ratio/max": 1.942676877975464, "sampling/importance_sampling_ratio/mean": 0.9999974012374878, "sampling/importance_sampling_ratio/min": 0.368558007478714, "sampling/sampling_logp_difference/max": 1.1511848449707032, "sampling/sampling_logp_difference/mean": 0.013521903567016125, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1358.6, "completions/max_terminated_length": 1358.6, "completions/mean_length": 1007.3875, "completions/mean_terminated_length": 1007.3875, "completions/min_length": 683.4, "completions/min_terminated_length": 683.4, "entropy": 0.2815568208694458, "epoch": 1.2867215041128084, "frac_reward_zero_std": 0.4, "grad_norm": 0.7796571254730225, "learning_rate": 3.829658347467894e-07, "loss": 0.003, "num_tokens": 147365153.0, "reward": 0.7693750143051148, "reward_std": 0.11561888605356216, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7693750143051148, "rewards/e2e_recall_precision_mixed_reward/std": 0.2604265660047531, "sampling/importance_sampling_ratio/max": 1.8576125860214234, "sampling/importance_sampling_ratio/mean": 0.9999807476997375, "sampling/importance_sampling_ratio/min": 0.3188953049480915, "sampling/sampling_logp_difference/max": 1.65875186920166, "sampling/sampling_logp_difference/mean": 0.013848881609737873, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1706.8, "completions/max_terminated_length": 1706.8, "completions/mean_length": 1156.165625, "completions/mean_terminated_length": 1156.165625, "completions/min_length": 800.0, "completions/min_terminated_length": 800.0, "entropy": 0.29959659576416015, "epoch": 1.2925969447708578, "frac_reward_zero_std": 0.5, "grad_norm": 0.7763993740081787, "learning_rate": 3.8236006784589286e-07, "loss": 0.0163, "num_tokens": 148050518.0, "reward": 0.6523437619209289, "reward_std": 0.08477627635002136, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6523437619209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.3252843528985977, "sampling/importance_sampling_ratio/max": 1.8790109157562256, "sampling/importance_sampling_ratio/mean": 0.9999762296676635, "sampling/importance_sampling_ratio/min": 0.27901336550712585, "sampling/sampling_logp_difference/max": 1.3847559690475464, "sampling/sampling_logp_difference/mean": 0.01403086856007576, "step": 1100 }, { "epoch": 1.2925969447708578, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1490.96, "eval_completions/max_terminated_length": 1490.96, "eval_completions/mean_length": 990.515, "eval_completions/mean_terminated_length": 990.515, "eval_completions/min_length": 692.08, "eval_completions/min_terminated_length": 692.08, "eval_entropy": 0.27925810635089876, "eval_frac_reward_zero_std": 0.47, "eval_loss": 0.005843394435942173, "eval_num_tokens": 148050518.0, "eval_reward": 0.6923645973205567, "eval_reward_std": 0.10961450830101967, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.6923645985126495, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3154654276371002, "eval_runtime": 394.8883, "eval_samples_per_second": 0.253, "eval_sampling/importance_sampling_ratio/max": 1.9187990188598634, "eval_sampling/importance_sampling_ratio/mean": 0.9999549126625061, "eval_sampling/importance_sampling_ratio/min": 0.33063108295202254, "eval_sampling/sampling_logp_difference/max": 1.217089729309082, "eval_sampling/sampling_logp_difference/mean": 0.013591401651501656, "eval_steps_per_second": 0.005, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1543.4, "completions/max_terminated_length": 1524.4, "completions/mean_length": 1014.178125, "completions/mean_terminated_length": 995.0234619140625, "completions/min_length": 694.2, "completions/min_terminated_length": 694.2, "entropy": 0.26261157989501954, "epoch": 1.2984723854289073, "frac_reward_zero_std": 0.3, "grad_norm": 0.5455294847488403, "learning_rate": 3.8175430094499635e-07, "loss": -0.0131, "num_tokens": 148691003.0, "reward": 0.6966146051883697, "reward_std": 0.15783809274435043, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6966146051883697, "rewards/e2e_recall_precision_mixed_reward/std": 0.34295274019241334, "sampling/importance_sampling_ratio/max": 1.9716975450515748, "sampling/importance_sampling_ratio/mean": 1.00016188621521, "sampling/importance_sampling_ratio/min": 0.3574655741453171, "sampling/sampling_logp_difference/max": 1.1117752075195313, "sampling/sampling_logp_difference/mean": 0.013009194284677505, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1565.6, "completions/max_terminated_length": 1565.6, "completions/mean_length": 1094.09375, "completions/mean_terminated_length": 1094.09375, "completions/min_length": 731.4, "completions/min_terminated_length": 731.4, "entropy": 0.2692096889019012, "epoch": 1.3043478260869565, "frac_reward_zero_std": 0.45, "grad_norm": 0.0, "learning_rate": 3.811485340440998e-07, "loss": 0.0113, "num_tokens": 149393113.0, "reward": 0.6494791746139527, "reward_std": 0.1348447620868683, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6494791746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.3158786088228226, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000579476356506, "sampling/importance_sampling_ratio/min": 0.29540793895721434, "sampling/sampling_logp_difference/max": 1.2994348287582398, "sampling/sampling_logp_difference/mean": 0.013329188153147698, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1392.6, "completions/max_terminated_length": 1392.6, "completions/mean_length": 1017.58125, "completions/mean_terminated_length": 1017.58125, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "entropy": 0.2770149827003479, "epoch": 1.3102232667450058, "frac_reward_zero_std": 0.45, "grad_norm": 0.7833777070045471, "learning_rate": 3.805427671432033e-07, "loss": 0.0032, "num_tokens": 150030707.0, "reward": 0.6882812678813934, "reward_std": 0.11308581605553628, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6882812678813934, "rewards/e2e_recall_precision_mixed_reward/std": 0.29742818176746366, "sampling/importance_sampling_ratio/max": 1.935225486755371, "sampling/importance_sampling_ratio/mean": 1.000083565711975, "sampling/importance_sampling_ratio/min": 0.45306941866874695, "sampling/sampling_logp_difference/max": 0.8295891046524048, "sampling/sampling_logp_difference/mean": 0.013369818776845932, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1730.0, "completions/max_terminated_length": 1730.0, "completions/mean_length": 1053.4125, "completions/mean_terminated_length": 1053.4125, "completions/min_length": 710.2, "completions/min_terminated_length": 710.2, "entropy": 0.2534839272499084, "epoch": 1.3160987074030552, "frac_reward_zero_std": 0.25, "grad_norm": 0.5904003977775574, "learning_rate": 3.799370002423067e-07, "loss": 0.0026, "num_tokens": 150675895.0, "reward": 0.6664583504199981, "reward_std": 0.11373435258865357, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6664583504199981, "rewards/e2e_recall_precision_mixed_reward/std": 0.2492772251367569, "sampling/importance_sampling_ratio/max": 1.8575801372528076, "sampling/importance_sampling_ratio/mean": 0.9999268770217895, "sampling/importance_sampling_ratio/min": 0.385788106918335, "sampling/sampling_logp_difference/max": 1.204633069038391, "sampling/sampling_logp_difference/mean": 0.012645184434950352, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1442.8, "completions/max_terminated_length": 1442.8, "completions/mean_length": 1003.15, "completions/mean_terminated_length": 1003.15, "completions/min_length": 749.6, "completions/min_terminated_length": 749.6, "entropy": 0.2703131794929504, "epoch": 1.3219741480611047, "frac_reward_zero_std": 0.25, "grad_norm": 0.7168449759483337, "learning_rate": 3.793312333414102e-07, "loss": -0.0106, "num_tokens": 151362823.0, "reward": 0.6812500119209289, "reward_std": 0.1785949647426605, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6812500119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.29773281812667846, "sampling/importance_sampling_ratio/max": 1.8423972368240356, "sampling/importance_sampling_ratio/mean": 0.9999767899513244, "sampling/importance_sampling_ratio/min": 0.2740974217653275, "sampling/sampling_logp_difference/max": 1.5454424619674683, "sampling/sampling_logp_difference/mean": 0.01345563717186451, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1378.8, "completions/max_terminated_length": 1378.8, "completions/mean_length": 955.834375, "completions/mean_terminated_length": 955.834375, "completions/min_length": 688.2, "completions/min_terminated_length": 688.2, "entropy": 0.26716753244400027, "epoch": 1.327849588719154, "frac_reward_zero_std": 0.4, "grad_norm": 0.6475690007209778, "learning_rate": 3.787254664405137e-07, "loss": 0.0029, "num_tokens": 151966018.0, "reward": 0.733593761920929, "reward_std": 0.13283937126398088, "rewards/e2e_recall_precision_mixed_reward/mean": 0.733593761920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.2886820375919342, "sampling/importance_sampling_ratio/max": 1.9504351377487184, "sampling/importance_sampling_ratio/mean": 0.9999610781669617, "sampling/importance_sampling_ratio/min": 0.2764181695878506, "sampling/sampling_logp_difference/max": 1.8616169929504394, "sampling/sampling_logp_difference/mean": 0.013580117933452129, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1355.8, "completions/max_terminated_length": 1355.8, "completions/mean_length": 989.5875, "completions/mean_terminated_length": 989.5875, "completions/min_length": 734.8, "completions/min_terminated_length": 734.8, "entropy": 0.2569707274436951, "epoch": 1.3337250293772032, "frac_reward_zero_std": 0.15, "grad_norm": 0.862073540687561, "learning_rate": 3.7811969953961713e-07, "loss": -0.0004, "num_tokens": 152581582.0, "reward": 0.6937500238418579, "reward_std": 0.1769823968410492, "rewards/e2e_recall_precision_mixed_reward/mean": 0.693750011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.30896676778793336, "sampling/importance_sampling_ratio/max": 1.9276174783706665, "sampling/importance_sampling_ratio/mean": 1.0000139355659485, "sampling/importance_sampling_ratio/min": 0.46765230894088744, "sampling/sampling_logp_difference/max": 1.0491119384765626, "sampling/sampling_logp_difference/mean": 0.013130680657923222, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1493.6, "completions/max_terminated_length": 1493.6, "completions/mean_length": 1019.05625, "completions/mean_terminated_length": 1019.05625, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "entropy": 0.27715269327163694, "epoch": 1.3396004700352526, "frac_reward_zero_std": 0.45, "grad_norm": 0.7062737345695496, "learning_rate": 3.775139326387206e-07, "loss": -0.001, "num_tokens": 153221584.0, "reward": 0.6836979329586029, "reward_std": 0.11981369033455849, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6836979329586029, "rewards/e2e_recall_precision_mixed_reward/std": 0.2908316344022751, "sampling/importance_sampling_ratio/max": 1.9820157051086427, "sampling/importance_sampling_ratio/mean": 1.0000499367713929, "sampling/importance_sampling_ratio/min": 0.2672798324376345, "sampling/sampling_logp_difference/max": 1.7221438407897949, "sampling/sampling_logp_difference/mean": 0.013661108911037445, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1410.2, "completions/max_terminated_length": 1410.2, "completions/mean_length": 1047.36875, "completions/mean_terminated_length": 1047.36875, "completions/min_length": 694.4, "completions/min_terminated_length": 694.4, "entropy": 0.28525510430336, "epoch": 1.345475910693302, "frac_reward_zero_std": 0.45, "grad_norm": 0.768993079662323, "learning_rate": 3.7690816573782406e-07, "loss": 0.0026, "num_tokens": 153870806.0, "reward": 0.64041668176651, "reward_std": 0.11628761440515518, "rewards/e2e_recall_precision_mixed_reward/mean": 0.64041668176651, "rewards/e2e_recall_precision_mixed_reward/std": 0.2930860996246338, "sampling/importance_sampling_ratio/max": 1.9370128154754638, "sampling/importance_sampling_ratio/mean": 0.999804961681366, "sampling/importance_sampling_ratio/min": 0.31624155938625337, "sampling/sampling_logp_difference/max": 1.2107308864593507, "sampling/sampling_logp_difference/mean": 0.013913381099700927, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1712.8, "completions/max_terminated_length": 1712.8, "completions/mean_length": 1118.059375, "completions/mean_terminated_length": 1118.059375, "completions/min_length": 778.2, "completions/min_terminated_length": 778.2, "entropy": 0.2776745676994324, "epoch": 1.3513513513513513, "frac_reward_zero_std": 0.5, "grad_norm": 0.7642037272453308, "learning_rate": 3.7630239883692754e-07, "loss": -0.0087, "num_tokens": 154553769.0, "reward": 0.7213541746139527, "reward_std": 0.10083994418382644, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7213541746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.31691438853740694, "sampling/importance_sampling_ratio/max": 1.981196355819702, "sampling/importance_sampling_ratio/mean": 1.0001033902168275, "sampling/importance_sampling_ratio/min": 0.2930480852723122, "sampling/sampling_logp_difference/max": 1.4658790826797485, "sampling/sampling_logp_difference/mean": 0.01348379347473383, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1759.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 1095.1125, "completions/mean_terminated_length": 1095.1125, "completions/min_length": 747.2, "completions/min_terminated_length": 747.2, "entropy": 0.2642821192741394, "epoch": 1.3572267920094008, "frac_reward_zero_std": 0.5, "grad_norm": 0.6376672983169556, "learning_rate": 3.7569663193603103e-07, "loss": 0.0022, "num_tokens": 155225645.0, "reward": 0.7658854246139526, "reward_std": 0.10820303261280059, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7658854246139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.2597797363996506, "sampling/importance_sampling_ratio/max": 1.948975110054016, "sampling/importance_sampling_ratio/mean": 1.000026035308838, "sampling/importance_sampling_ratio/min": 0.35516688525676726, "sampling/sampling_logp_difference/max": 1.1820269107818604, "sampling/sampling_logp_difference/mean": 0.013134175911545753, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1399.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 1009.390625, "completions/mean_terminated_length": 1009.390625, "completions/min_length": 767.4, "completions/min_terminated_length": 767.4, "entropy": 0.2690662145614624, "epoch": 1.36310223266745, "frac_reward_zero_std": 0.35, "grad_norm": 0.7229093909263611, "learning_rate": 3.7509086503513447e-07, "loss": 0.0111, "num_tokens": 155873482.0, "reward": 0.7645833373069764, "reward_std": 0.1397414982318878, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7645833373069764, "rewards/e2e_recall_precision_mixed_reward/std": 0.2538767337799072, "sampling/importance_sampling_ratio/max": 1.9541364669799806, "sampling/importance_sampling_ratio/mean": 1.0000741958618165, "sampling/importance_sampling_ratio/min": 0.3407637387514114, "sampling/sampling_logp_difference/max": 1.1103405952453613, "sampling/sampling_logp_difference/mean": 0.013483352214097976, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1418.8, "completions/max_terminated_length": 1418.8, "completions/mean_length": 1008.209375, "completions/mean_terminated_length": 1008.209375, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "entropy": 0.2757707953453064, "epoch": 1.3689776733254995, "frac_reward_zero_std": 0.3, "grad_norm": 0.7225468754768372, "learning_rate": 3.7448509813423796e-07, "loss": 0.0025, "num_tokens": 156518173.0, "reward": 0.6832291841506958, "reward_std": 0.13243707567453383, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6832291841506958, "rewards/e2e_recall_precision_mixed_reward/std": 0.34315310418605804, "sampling/importance_sampling_ratio/max": 1.9023520231246949, "sampling/importance_sampling_ratio/mean": 1.0001229047775269, "sampling/importance_sampling_ratio/min": 0.3911436438560486, "sampling/sampling_logp_difference/max": 1.1172781705856323, "sampling/sampling_logp_difference/mean": 0.013606655411422253, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1205.0, "completions/max_terminated_length": 1205.0, "completions/mean_length": 889.803125, "completions/mean_terminated_length": 889.803125, "completions/min_length": 617.6, "completions/min_terminated_length": 617.6, "entropy": 0.2621892154216766, "epoch": 1.3748531139835487, "frac_reward_zero_std": 0.4, "grad_norm": 0.5152686834335327, "learning_rate": 3.7387933123334134e-07, "loss": -0.0004, "num_tokens": 157141166.0, "reward": 0.7700520992279053, "reward_std": 0.12003648579120636, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7700521111488342, "rewards/e2e_recall_precision_mixed_reward/std": 0.3090189278125763, "sampling/importance_sampling_ratio/max": 1.9857268810272217, "sampling/importance_sampling_ratio/mean": 1.0000556349754333, "sampling/importance_sampling_ratio/min": 0.3500809669494629, "sampling/sampling_logp_difference/max": 1.161958146095276, "sampling/sampling_logp_difference/mean": 0.013260713964700698, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1336.8, "completions/max_terminated_length": 1336.8, "completions/mean_length": 970.371875, "completions/mean_terminated_length": 970.371875, "completions/min_length": 635.2, "completions/min_terminated_length": 635.2, "entropy": 0.2712902396917343, "epoch": 1.3807285546415982, "frac_reward_zero_std": 0.4, "grad_norm": 0.7717751860618591, "learning_rate": 3.7327356433244483e-07, "loss": 0.0001, "num_tokens": 157777893.0, "reward": 0.7880208492279053, "reward_std": 0.1345105454325676, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7880208492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.23644578456878662, "sampling/importance_sampling_ratio/max": 1.9879061222076415, "sampling/importance_sampling_ratio/mean": 1.0000129222869873, "sampling/importance_sampling_ratio/min": 0.4100755751132965, "sampling/sampling_logp_difference/max": 1.0043761134147644, "sampling/sampling_logp_difference/mean": 0.013120016269385814, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1377.4, "completions/max_terminated_length": 1377.4, "completions/mean_length": 953.33125, "completions/mean_terminated_length": 953.33125, "completions/min_length": 671.4, "completions/min_terminated_length": 671.4, "entropy": 0.2518512338399887, "epoch": 1.3866039952996474, "frac_reward_zero_std": 0.45, "grad_norm": 0.708027184009552, "learning_rate": 3.726677974315483e-07, "loss": 0.0097, "num_tokens": 158411423.0, "reward": 0.7781770944595336, "reward_std": 0.10090606659650803, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7781770944595336, "rewards/e2e_recall_precision_mixed_reward/std": 0.2655190169811249, "sampling/importance_sampling_ratio/max": 1.914062237739563, "sampling/importance_sampling_ratio/mean": 0.9999913334846496, "sampling/importance_sampling_ratio/min": 0.4159839451313019, "sampling/sampling_logp_difference/max": 0.9063684225082398, "sampling/sampling_logp_difference/mean": 0.012738440930843354, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1215.2, "completions/max_terminated_length": 1215.2, "completions/mean_length": 938.703125, "completions/mean_terminated_length": 938.703125, "completions/min_length": 698.2, "completions/min_terminated_length": 698.2, "entropy": 0.2818134605884552, "epoch": 1.3924794359576969, "frac_reward_zero_std": 0.45, "grad_norm": 0.6890040040016174, "learning_rate": 3.7206203053065176e-07, "loss": 0.0017, "num_tokens": 159030960.0, "reward": 0.675677090883255, "reward_std": 0.12252334356307984, "rewards/e2e_recall_precision_mixed_reward/mean": 0.675677090883255, "rewards/e2e_recall_precision_mixed_reward/std": 0.30053952932357786, "sampling/importance_sampling_ratio/max": 1.8457768201828002, "sampling/importance_sampling_ratio/mean": 1.0000203847885132, "sampling/importance_sampling_ratio/min": 0.5052196741104126, "sampling/sampling_logp_difference/max": 0.7195221900939941, "sampling/sampling_logp_difference/mean": 0.013677260465919971, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1416.8, "completions/max_terminated_length": 1416.8, "completions/mean_length": 1010.390625, "completions/mean_terminated_length": 1010.390625, "completions/min_length": 713.4, "completions/min_terminated_length": 713.4, "entropy": 0.2787392377853394, "epoch": 1.398354876615746, "frac_reward_zero_std": 0.55, "grad_norm": 0.38302043080329895, "learning_rate": 3.7145626362975525e-07, "loss": 0.0009, "num_tokens": 159661149.0, "reward": 0.682812511920929, "reward_std": 0.09650907553732395, "rewards/e2e_recall_precision_mixed_reward/mean": 0.682812511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.2945326387882233, "sampling/importance_sampling_ratio/max": 1.927133321762085, "sampling/importance_sampling_ratio/mean": 0.9999685287475586, "sampling/importance_sampling_ratio/min": 0.33883661329746245, "sampling/sampling_logp_difference/max": 1.2834124326705934, "sampling/sampling_logp_difference/mean": 0.01364175509661436, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1491.2, "completions/max_terminated_length": 1491.2, "completions/mean_length": 1039.9375, "completions/mean_terminated_length": 1039.9375, "completions/min_length": 756.6, "completions/min_terminated_length": 756.6, "entropy": 0.2633058696985245, "epoch": 1.4042303172737955, "frac_reward_zero_std": 0.4, "grad_norm": 0.6820769906044006, "learning_rate": 3.708504967288587e-07, "loss": 0.0059, "num_tokens": 160330617.0, "reward": 0.789843761920929, "reward_std": 0.12763621509075165, "rewards/e2e_recall_precision_mixed_reward/mean": 0.789843761920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.26849266290664675, "sampling/importance_sampling_ratio/max": 1.9752416849136352, "sampling/importance_sampling_ratio/mean": 1.0000136494636536, "sampling/importance_sampling_ratio/min": 0.40743643045425415, "sampling/sampling_logp_difference/max": 1.0446730375289917, "sampling/sampling_logp_difference/mean": 0.012942253239452839, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1338.0, "completions/max_terminated_length": 1338.0, "completions/mean_length": 932.365625, "completions/mean_terminated_length": 932.365625, "completions/min_length": 648.6, "completions/min_terminated_length": 648.6, "entropy": 0.2665301501750946, "epoch": 1.410105757931845, "frac_reward_zero_std": 0.5, "grad_norm": 0.6627605557441711, "learning_rate": 3.7024472982796217e-07, "loss": 0.0013, "num_tokens": 160940990.0, "reward": 0.6481250047683715, "reward_std": 0.10753663703799247, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6481250047683715, "rewards/e2e_recall_precision_mixed_reward/std": 0.36586096286773684, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000364661216736, "sampling/importance_sampling_ratio/min": 0.38219852447509767, "sampling/sampling_logp_difference/max": 1.015994167327881, "sampling/sampling_logp_difference/mean": 0.013067251071333885, "step": 1200 }, { "epoch": 1.410105757931845, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1236.44, "eval_completions/max_terminated_length": 1236.44, "eval_completions/mean_length": 903.0825, "eval_completions/mean_terminated_length": 903.0825, "eval_completions/min_length": 680.32, "eval_completions/min_terminated_length": 680.32, "eval_entropy": 0.2593661844730377, "eval_frac_reward_zero_std": 0.48, "eval_loss": 0.0016279831761494279, "eval_num_tokens": 160940990.0, "eval_reward": 0.6931771004199981, "eval_reward_std": 0.10766801729798317, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.6931771016120911, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.31816164910793304, "eval_runtime": 334.8567, "eval_samples_per_second": 0.299, "eval_sampling/importance_sampling_ratio/max": 1.915867462158203, "eval_sampling/importance_sampling_ratio/mean": 0.9999779844284058, "eval_sampling/importance_sampling_ratio/min": 0.39130869776010513, "eval_sampling/sampling_logp_difference/max": 1.063571047782898, "eval_sampling/sampling_logp_difference/mean": 0.012970775477588176, "eval_steps_per_second": 0.006, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1300.2, "completions/max_terminated_length": 1300.2, "completions/mean_length": 934.4875, "completions/mean_terminated_length": 934.4875, "completions/min_length": 667.2, "completions/min_terminated_length": 667.2, "entropy": 0.2741489470005035, "epoch": 1.4159811985898942, "frac_reward_zero_std": 0.4, "grad_norm": 0.8523896336555481, "learning_rate": 3.6963896292706566e-07, "loss": 0.0045, "num_tokens": 161607738.0, "reward": 0.6617187678813934, "reward_std": 0.11894842982292175, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6617187798023224, "rewards/e2e_recall_precision_mixed_reward/std": 0.32751025855541227, "sampling/importance_sampling_ratio/max": 1.9774553775787354, "sampling/importance_sampling_ratio/mean": 1.0000733613967896, "sampling/importance_sampling_ratio/min": 0.2799623891711235, "sampling/sampling_logp_difference/max": 1.5475644588470459, "sampling/sampling_logp_difference/mean": 0.013712556660175323, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1443.0, "completions/max_terminated_length": 1443.0, "completions/mean_length": 975.809375, "completions/mean_terminated_length": 975.809375, "completions/min_length": 724.2, "completions/min_terminated_length": 724.2, "entropy": 0.26065073907375336, "epoch": 1.4218566392479435, "frac_reward_zero_std": 0.3, "grad_norm": 0.6366615891456604, "learning_rate": 3.690331960261691e-07, "loss": -0.0025, "num_tokens": 162238429.0, "reward": 0.6471354246139527, "reward_std": 0.1409156620502472, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6471354365348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.2605146482586861, "sampling/importance_sampling_ratio/max": 1.9828581094741822, "sampling/importance_sampling_ratio/mean": 0.9999633073806763, "sampling/importance_sampling_ratio/min": 0.3066392242908478, "sampling/sampling_logp_difference/max": 1.2085018634796143, "sampling/sampling_logp_difference/mean": 0.013046330399811268, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1418.6, "completions/max_terminated_length": 1418.6, "completions/mean_length": 965.3125, "completions/mean_terminated_length": 965.3125, "completions/min_length": 718.8, "completions/min_terminated_length": 718.8, "entropy": 0.2614882171154022, "epoch": 1.427732079905993, "frac_reward_zero_std": 0.5, "grad_norm": 0.6511854529380798, "learning_rate": 3.684274291252726e-07, "loss": 0.0002, "num_tokens": 162839185.0, "reward": 0.7278646111488343, "reward_std": 0.08626341633498669, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7278646111488343, "rewards/e2e_recall_precision_mixed_reward/std": 0.3056270956993103, "sampling/importance_sampling_ratio/max": 1.926198172569275, "sampling/importance_sampling_ratio/mean": 0.9999306440353394, "sampling/importance_sampling_ratio/min": 0.3832464128732681, "sampling/sampling_logp_difference/max": 1.1059164881706238, "sampling/sampling_logp_difference/mean": 0.012989461980760097, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1518.6, "completions/max_terminated_length": 1325.4, "completions/mean_length": 996.95, "completions/mean_terminated_length": 992.1390991210938, "completions/min_length": 739.6, "completions/min_terminated_length": 739.6, "entropy": 0.2724646270275116, "epoch": 1.4336075205640424, "frac_reward_zero_std": 0.5, "grad_norm": 0.3615824580192566, "learning_rate": 3.67821662224376e-07, "loss": -0.008, "num_tokens": 163491181.0, "reward": 0.73125, "reward_std": 0.09822167605161666, "rewards/e2e_recall_precision_mixed_reward/mean": 0.73125, "rewards/e2e_recall_precision_mixed_reward/std": 0.2837597757577896, "sampling/importance_sampling_ratio/max": 1.8505266666412354, "sampling/importance_sampling_ratio/mean": 0.9999452590942383, "sampling/importance_sampling_ratio/min": 0.4435959100723267, "sampling/sampling_logp_difference/max": 1.5051439166069032, "sampling/sampling_logp_difference/mean": 0.013260528817772866, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 1007.390625, "completions/mean_terminated_length": 1007.390625, "completions/min_length": 767.8, "completions/min_terminated_length": 767.8, "entropy": 0.26534418761730194, "epoch": 1.4394829612220916, "frac_reward_zero_std": 0.55, "grad_norm": 0.0, "learning_rate": 3.672158953234795e-07, "loss": 0.0022, "num_tokens": 164151914.0, "reward": 0.7981250166893006, "reward_std": 0.1127402737736702, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7981250166893006, "rewards/e2e_recall_precision_mixed_reward/std": 0.2560457527637482, "sampling/importance_sampling_ratio/max": 1.9135950803756714, "sampling/importance_sampling_ratio/mean": 0.999986755847931, "sampling/importance_sampling_ratio/min": 0.303699953854084, "sampling/sampling_logp_difference/max": 1.3842716932296752, "sampling/sampling_logp_difference/mean": 0.013141075521707535, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1462.2, "completions/max_terminated_length": 1462.2, "completions/mean_length": 1018.825, "completions/mean_terminated_length": 1018.825, "completions/min_length": 698.2, "completions/min_terminated_length": 698.2, "entropy": 0.2592238187789917, "epoch": 1.445358401880141, "frac_reward_zero_std": 0.4, "grad_norm": 0.595700740814209, "learning_rate": 3.66610128422583e-07, "loss": -0.0046, "num_tokens": 164830738.0, "reward": 0.6343229413032532, "reward_std": 0.10299613662064075, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6343229413032532, "rewards/e2e_recall_precision_mixed_reward/std": 0.3512304097414017, "sampling/importance_sampling_ratio/max": 1.9878422021865845, "sampling/importance_sampling_ratio/mean": 0.9999564051628113, "sampling/importance_sampling_ratio/min": 0.35334097146987914, "sampling/sampling_logp_difference/max": 1.1131124019622802, "sampling/sampling_logp_difference/mean": 0.012872187793254853, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1467.4, "completions/max_terminated_length": 1467.4, "completions/mean_length": 993.9, "completions/mean_terminated_length": 993.9, "completions/min_length": 724.6, "completions/min_terminated_length": 724.6, "entropy": 0.2649468719959259, "epoch": 1.4512338425381903, "frac_reward_zero_std": 0.5, "grad_norm": 0.6314127445220947, "learning_rate": 3.6600436152168644e-07, "loss": -0.0026, "num_tokens": 165447458.0, "reward": 0.7947916865348816, "reward_std": 0.1078619197010994, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7947916865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.19745177924633026, "sampling/importance_sampling_ratio/max": 1.977701473236084, "sampling/importance_sampling_ratio/mean": 0.9999303340911865, "sampling/importance_sampling_ratio/min": 0.2354953714646399, "sampling/sampling_logp_difference/max": 2.2229801654815673, "sampling/sampling_logp_difference/mean": 0.012961567752063275, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1326.6, "completions/max_terminated_length": 1326.6, "completions/mean_length": 996.71875, "completions/mean_terminated_length": 996.71875, "completions/min_length": 760.4, "completions/min_terminated_length": 760.4, "entropy": 0.2625953197479248, "epoch": 1.4571092831962398, "frac_reward_zero_std": 0.65, "grad_norm": 0.41131144762039185, "learning_rate": 3.6539859462078993e-07, "loss": 0.001, "num_tokens": 166096216.0, "reward": 0.7678125143051148, "reward_std": 0.09220594316720962, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7678125143051148, "rewards/e2e_recall_precision_mixed_reward/std": 0.23345414102077483, "sampling/importance_sampling_ratio/max": 1.895510697364807, "sampling/importance_sampling_ratio/mean": 0.999940812587738, "sampling/importance_sampling_ratio/min": 0.4072090119123459, "sampling/sampling_logp_difference/max": 0.9523330926895142, "sampling/sampling_logp_difference/mean": 0.01279841959476471, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1585.8, "completions/max_terminated_length": 1548.8, "completions/mean_length": 1040.771875, "completions/mean_terminated_length": 1036.761669921875, "completions/min_length": 738.6, "completions/min_terminated_length": 738.6, "entropy": 0.25499052703380587, "epoch": 1.462984723854289, "frac_reward_zero_std": 0.45, "grad_norm": 0.5272420048713684, "learning_rate": 3.647928277198934e-07, "loss": 0.0015, "num_tokens": 166732619.0, "reward": 0.7338541924953461, "reward_std": 0.1370665103197098, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7338541924953461, "rewards/e2e_recall_precision_mixed_reward/std": 0.2300163447856903, "sampling/importance_sampling_ratio/max": 1.9345684766769409, "sampling/importance_sampling_ratio/mean": 1.0000292301177978, "sampling/importance_sampling_ratio/min": 0.2501420438289642, "sampling/sampling_logp_difference/max": 1.5750776767730712, "sampling/sampling_logp_difference/mean": 0.01261440571397543, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375, "completions/max_length": 1672.4, "completions/max_terminated_length": 1648.4, "completions/mean_length": 1038.628125, "completions/mean_terminated_length": 1026.3600830078126, "completions/min_length": 766.8, "completions/min_terminated_length": 766.8, "entropy": 0.2597430557012558, "epoch": 1.4688601645123385, "frac_reward_zero_std": 0.45, "grad_norm": 0.6333122253417969, "learning_rate": 3.6418706081899685e-07, "loss": -0.0051, "num_tokens": 167379848.0, "reward": 0.7830729246139526, "reward_std": 0.09931705892086029, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7830729246139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.27932691723108294, "sampling/importance_sampling_ratio/max": 1.9465564727783202, "sampling/importance_sampling_ratio/mean": 1.0000145792961121, "sampling/importance_sampling_ratio/min": 0.33197267055511476, "sampling/sampling_logp_difference/max": 1.224328637123108, "sampling/sampling_logp_difference/mean": 0.012780552357435226, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1333.4, "completions/max_terminated_length": 1333.4, "completions/mean_length": 967.034375, "completions/mean_terminated_length": 967.034375, "completions/min_length": 714.6, "completions/min_terminated_length": 714.6, "entropy": 0.26842235326766967, "epoch": 1.4747356051703877, "frac_reward_zero_std": 0.55, "grad_norm": 0.6638708710670471, "learning_rate": 3.635812939181003e-07, "loss": 0.0041, "num_tokens": 168031283.0, "reward": 0.784583330154419, "reward_std": 0.07790126055479049, "rewards/e2e_recall_precision_mixed_reward/mean": 0.784583330154419, "rewards/e2e_recall_precision_mixed_reward/std": 0.27794828414916994, "sampling/importance_sampling_ratio/max": 1.9674632549285889, "sampling/importance_sampling_ratio/mean": 0.9999781250953674, "sampling/importance_sampling_ratio/min": 0.32500605285167694, "sampling/sampling_logp_difference/max": 1.4220096588134765, "sampling/sampling_logp_difference/mean": 0.013222084194421769, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1456.6, "completions/max_terminated_length": 1456.6, "completions/mean_length": 1003.51875, "completions/mean_terminated_length": 1003.51875, "completions/min_length": 712.6, "completions/min_terminated_length": 712.6, "entropy": 0.2660016596317291, "epoch": 1.4806110458284372, "frac_reward_zero_std": 0.4, "grad_norm": 0.707177996635437, "learning_rate": 3.6297552701720373e-07, "loss": -0.0025, "num_tokens": 168686857.0, "reward": 0.7382812619209289, "reward_std": 0.13373910933732985, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7382812619209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.30144949853420255, "sampling/importance_sampling_ratio/max": 1.9266873836517333, "sampling/importance_sampling_ratio/mean": 1.0000134468078614, "sampling/importance_sampling_ratio/min": 0.26473745703697205, "sampling/sampling_logp_difference/max": 1.5025355339050293, "sampling/sampling_logp_difference/mean": 0.013266277499496937, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.4, "completions/max_terminated_length": 1320.4, "completions/mean_length": 961.946875, "completions/mean_terminated_length": 961.946875, "completions/min_length": 756.8, "completions/min_terminated_length": 756.8, "entropy": 0.27312275767326355, "epoch": 1.4864864864864864, "frac_reward_zero_std": 0.45, "grad_norm": 0.7554060816764832, "learning_rate": 3.623697601163072e-07, "loss": 0.0003, "num_tokens": 169318056.0, "reward": 0.8033854246139527, "reward_std": 0.10426819771528244, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8033854246139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.2865228235721588, "sampling/importance_sampling_ratio/max": 1.8720565795898438, "sampling/importance_sampling_ratio/mean": 0.9999788880348206, "sampling/importance_sampling_ratio/min": 0.4627742886543274, "sampling/sampling_logp_difference/max": 0.8917694330215454, "sampling/sampling_logp_difference/mean": 0.01329927183687687, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1299.0, "completions/max_terminated_length": 1299.0, "completions/mean_length": 990.95, "completions/mean_terminated_length": 990.95, "completions/min_length": 752.8, "completions/min_terminated_length": 752.8, "entropy": 0.26616363525390624, "epoch": 1.4923619271445359, "frac_reward_zero_std": 0.5, "grad_norm": 0.5357159376144409, "learning_rate": 3.6176399321541065e-07, "loss": 0.0009, "num_tokens": 169984952.0, "reward": 0.7421875357627868, "reward_std": 0.11765087842941284, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7421875357627868, "rewards/e2e_recall_precision_mixed_reward/std": 0.28093287646770476, "sampling/importance_sampling_ratio/max": 1.8472656726837158, "sampling/importance_sampling_ratio/mean": 1.000068485736847, "sampling/importance_sampling_ratio/min": 0.4106867015361786, "sampling/sampling_logp_difference/max": 0.9185904502868653, "sampling/sampling_logp_difference/mean": 0.013041174784302712, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1387.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 987.8, "completions/mean_terminated_length": 987.8, "completions/min_length": 690.6, "completions/min_terminated_length": 690.6, "entropy": 0.2711699903011322, "epoch": 1.4982373678025853, "frac_reward_zero_std": 0.4, "grad_norm": 0.6310663223266602, "learning_rate": 3.6115822631451414e-07, "loss": -0.0026, "num_tokens": 170648824.0, "reward": 0.6416666805744171, "reward_std": 0.1222050666809082, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6416666805744171, "rewards/e2e_recall_precision_mixed_reward/std": 0.2899262696504593, "sampling/importance_sampling_ratio/max": 1.9429375410079956, "sampling/importance_sampling_ratio/mean": 1.000006639957428, "sampling/importance_sampling_ratio/min": 0.33936918079853057, "sampling/sampling_logp_difference/max": 1.1747238516807557, "sampling/sampling_logp_difference/mean": 0.013466325402259827, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1381.8, "completions/max_terminated_length": 1381.8, "completions/mean_length": 959.015625, "completions/mean_terminated_length": 959.015625, "completions/min_length": 684.0, "completions/min_terminated_length": 684.0, "entropy": 0.26730274558067324, "epoch": 1.5041128084606346, "frac_reward_zero_std": 0.4, "grad_norm": 0.43127650022506714, "learning_rate": 3.6055245941361763e-07, "loss": 0.0017, "num_tokens": 171280445.0, "reward": 0.7687500059604645, "reward_std": 0.11303048729896545, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7687500059604645, "rewards/e2e_recall_precision_mixed_reward/std": 0.2574013233184814, "sampling/importance_sampling_ratio/max": 1.9692850351333617, "sampling/importance_sampling_ratio/mean": 0.9999760866165162, "sampling/importance_sampling_ratio/min": 0.3867207020521164, "sampling/sampling_logp_difference/max": 1.179562497138977, "sampling/sampling_logp_difference/mean": 0.013569790497422218, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1353.8, "completions/max_terminated_length": 1353.8, "completions/mean_length": 977.803125, "completions/mean_terminated_length": 977.803125, "completions/min_length": 769.2, "completions/min_terminated_length": 769.2, "entropy": 0.2703259289264679, "epoch": 1.5099882491186838, "frac_reward_zero_std": 0.5, "grad_norm": 0.5339919924736023, "learning_rate": 3.5994669251272107e-07, "loss": -0.0022, "num_tokens": 171940318.0, "reward": 0.8276041984558106, "reward_std": 0.10052161514759064, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8276041984558106, "rewards/e2e_recall_precision_mixed_reward/std": 0.24535318613052368, "sampling/importance_sampling_ratio/max": 1.8749458074569703, "sampling/importance_sampling_ratio/mean": 0.9999295115470886, "sampling/importance_sampling_ratio/min": 0.41402388215065, "sampling/sampling_logp_difference/max": 0.8982853889465332, "sampling/sampling_logp_difference/mean": 0.0132589066401124, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1344.6, "completions/max_terminated_length": 1344.6, "completions/mean_length": 993.3, "completions/mean_terminated_length": 993.3, "completions/min_length": 683.2, "completions/min_terminated_length": 683.2, "entropy": 0.2562386393547058, "epoch": 1.5158636897767332, "frac_reward_zero_std": 0.45, "grad_norm": 0.6523430943489075, "learning_rate": 3.5934092561182456e-07, "loss": 0.0026, "num_tokens": 172547102.0, "reward": 0.7180729448795319, "reward_std": 0.11666595637798309, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7180729448795319, "rewards/e2e_recall_precision_mixed_reward/std": 0.30029604732990267, "sampling/importance_sampling_ratio/max": 1.8882590532302856, "sampling/importance_sampling_ratio/mean": 1.0000897526741028, "sampling/importance_sampling_ratio/min": 0.380884712934494, "sampling/sampling_logp_difference/max": 0.9894267439842224, "sampling/sampling_logp_difference/mean": 0.012895303219556809, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1410.6, "completions/max_terminated_length": 1410.6, "completions/mean_length": 1026.284375, "completions/mean_terminated_length": 1026.284375, "completions/min_length": 671.2, "completions/min_terminated_length": 671.2, "entropy": 0.2772479742765427, "epoch": 1.5217391304347827, "frac_reward_zero_std": 0.25, "grad_norm": 0.8672804832458496, "learning_rate": 3.5873515871092805e-07, "loss": 0.0024, "num_tokens": 173226025.0, "reward": 0.8010937690734863, "reward_std": 0.14239197373390197, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8010937690734863, "rewards/e2e_recall_precision_mixed_reward/std": 0.2529195070266724, "sampling/importance_sampling_ratio/max": 1.9743723630905152, "sampling/importance_sampling_ratio/mean": 1.0001447439193725, "sampling/importance_sampling_ratio/min": 0.3725964456796646, "sampling/sampling_logp_difference/max": 1.0943672776222229, "sampling/sampling_logp_difference/mean": 0.013937021978199482, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1239.2, "completions/max_terminated_length": 1239.2, "completions/mean_length": 953.140625, "completions/mean_terminated_length": 953.140625, "completions/min_length": 679.6, "completions/min_terminated_length": 679.6, "entropy": 0.25364493727684023, "epoch": 1.527614571092832, "frac_reward_zero_std": 0.5, "grad_norm": 0.3933258652687073, "learning_rate": 3.581293918100315e-07, "loss": 0.0064, "num_tokens": 173848950.0, "reward": 0.7255208492279053, "reward_std": 0.10795222967863083, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7255208492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.31787977516651156, "sampling/importance_sampling_ratio/max": 1.8450412988662719, "sampling/importance_sampling_ratio/mean": 0.9999418258666992, "sampling/importance_sampling_ratio/min": 0.29344726353883743, "sampling/sampling_logp_difference/max": 1.453646445274353, "sampling/sampling_logp_difference/mean": 0.01269476506859064, "step": 1300 }, { "epoch": 1.527614571092832, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1382.28, "eval_completions/max_terminated_length": 1382.28, "eval_completions/mean_length": 965.419375, "eval_completions/mean_terminated_length": 965.419375, "eval_completions/min_length": 709.76, "eval_completions/min_terminated_length": 709.76, "eval_entropy": 0.26879385590553284, "eval_frac_reward_zero_std": 0.49, "eval_loss": 0.004258011933416128, "eval_num_tokens": 173848950.0, "eval_reward": 0.7047604262828827, "eval_reward_std": 0.10310830242931843, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7047604262828827, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3197030121088028, "eval_runtime": 367.9737, "eval_samples_per_second": 0.272, "eval_sampling/importance_sampling_ratio/max": 1.9342151737213136, "eval_sampling/importance_sampling_ratio/mean": 1.0000292944908142, "eval_sampling/importance_sampling_ratio/min": 0.37101240634918214, "eval_sampling/sampling_logp_difference/max": 1.128817195892334, "eval_sampling/sampling_logp_difference/mean": 0.013480741195380688, "eval_steps_per_second": 0.005, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1578.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 1029.65625, "completions/mean_terminated_length": 1029.65625, "completions/min_length": 694.6, "completions/min_terminated_length": 694.6, "entropy": 0.2900865375995636, "epoch": 1.5334900117508812, "frac_reward_zero_std": 0.3, "grad_norm": 0.8898422718048096, "learning_rate": 3.5752362490913497e-07, "loss": 0.0011, "num_tokens": 174539704.0, "reward": 0.6721354484558105, "reward_std": 0.12114289328455925, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6721354484558105, "rewards/e2e_recall_precision_mixed_reward/std": 0.30387861728668214, "sampling/importance_sampling_ratio/max": 1.9696703195571899, "sampling/importance_sampling_ratio/mean": 1.0000692009925842, "sampling/importance_sampling_ratio/min": 0.42663750648498533, "sampling/sampling_logp_difference/max": 0.9719892978668213, "sampling/sampling_logp_difference/mean": 0.0140638317912817, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1501.8, "completions/max_terminated_length": 1501.8, "completions/mean_length": 1011.9, "completions/mean_terminated_length": 1011.9, "completions/min_length": 737.4, "completions/min_terminated_length": 737.4, "entropy": 0.27017735242843627, "epoch": 1.5393654524089306, "frac_reward_zero_std": 0.45, "grad_norm": 0.8569518327713013, "learning_rate": 3.569178580082384e-07, "loss": -0.0016, "num_tokens": 175190648.0, "reward": 0.7643229365348816, "reward_std": 0.11197378635406494, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7643229365348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.24610742479562758, "sampling/importance_sampling_ratio/max": 1.9556582927703858, "sampling/importance_sampling_ratio/mean": 0.9999646782875061, "sampling/importance_sampling_ratio/min": 0.31404358744621275, "sampling/sampling_logp_difference/max": 1.3031816244125367, "sampling/sampling_logp_difference/mean": 0.013381559960544109, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1600.8, "completions/max_terminated_length": 1600.8, "completions/mean_length": 1084.4625, "completions/mean_terminated_length": 1084.4625, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "entropy": 0.27942303419113157, "epoch": 1.54524089306698, "frac_reward_zero_std": 0.3, "grad_norm": 0.8738449215888977, "learning_rate": 3.563120911073419e-07, "loss": -0.0044, "num_tokens": 175877324.0, "reward": 0.6853646039962769, "reward_std": 0.15504504293203353, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6853646039962769, "rewards/e2e_recall_precision_mixed_reward/std": 0.33614162504673006, "sampling/importance_sampling_ratio/max": 1.9305254220962524, "sampling/importance_sampling_ratio/mean": 0.9999444603919982, "sampling/importance_sampling_ratio/min": 0.3318605124950409, "sampling/sampling_logp_difference/max": 1.1842295169830321, "sampling/sampling_logp_difference/mean": 0.01380113661289215, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1365.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 962.596875, "completions/mean_terminated_length": 962.596875, "completions/min_length": 651.4, "completions/min_terminated_length": 651.4, "entropy": 0.2730784237384796, "epoch": 1.5511163337250293, "frac_reward_zero_std": 0.35, "grad_norm": 0.9576377868652344, "learning_rate": 3.557063242064454e-07, "loss": -0.0066, "num_tokens": 176521867.0, "reward": 0.7044270992279053, "reward_std": 0.1317270040512085, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7044270992279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.32034913301467893, "sampling/importance_sampling_ratio/max": 1.9374944686889648, "sampling/importance_sampling_ratio/mean": 0.9999537825584411, "sampling/importance_sampling_ratio/min": 0.37277138531208037, "sampling/sampling_logp_difference/max": 1.428537940979004, "sampling/sampling_logp_difference/mean": 0.013699793815612793, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1501.0, "completions/max_terminated_length": 1434.8, "completions/mean_length": 935.81875, "completions/mean_terminated_length": 931.1066040039062, "completions/min_length": 664.6, "completions/min_terminated_length": 664.6, "entropy": 0.25443568229675295, "epoch": 1.5569917743830788, "frac_reward_zero_std": 0.5, "grad_norm": 0.6914281845092773, "learning_rate": 3.551005573055488e-07, "loss": -0.0028, "num_tokens": 177138973.0, "reward": 0.7851562619209289, "reward_std": 0.10055364742875099, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7851562619209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.2937077939510345, "sampling/importance_sampling_ratio/max": 1.9170387506484985, "sampling/importance_sampling_ratio/mean": 1.0001505136489868, "sampling/importance_sampling_ratio/min": 0.3506089061498642, "sampling/sampling_logp_difference/max": 1.0952062368392945, "sampling/sampling_logp_difference/mean": 0.01276035774499178, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1556.4, "completions/max_terminated_length": 1556.4, "completions/mean_length": 1025.075, "completions/mean_terminated_length": 1025.075, "completions/min_length": 752.4, "completions/min_terminated_length": 752.4, "entropy": 0.2668500870466232, "epoch": 1.5628672150411282, "frac_reward_zero_std": 0.55, "grad_norm": 0.806428849697113, "learning_rate": 3.544947904046523e-07, "loss": -0.0, "num_tokens": 177795797.0, "reward": 0.7680729389190674, "reward_std": 0.08650054633617402, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7680729389190674, "rewards/e2e_recall_precision_mixed_reward/std": 0.27004733085632326, "sampling/importance_sampling_ratio/max": 1.954677987098694, "sampling/importance_sampling_ratio/mean": 1.0000656127929688, "sampling/importance_sampling_ratio/min": 0.35064939856529237, "sampling/sampling_logp_difference/max": 1.1046170234680175, "sampling/sampling_logp_difference/mean": 0.013290046527981759, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1229.8, "completions/max_terminated_length": 1229.8, "completions/mean_length": 947.878125, "completions/mean_terminated_length": 947.878125, "completions/min_length": 675.6, "completions/min_terminated_length": 675.6, "entropy": 0.27370848655700686, "epoch": 1.5687426556991775, "frac_reward_zero_std": 0.4, "grad_norm": 0.763029932975769, "learning_rate": 3.538890235037557e-07, "loss": -0.0025, "num_tokens": 178420750.0, "reward": 0.7265625238418579, "reward_std": 0.14514898210763932, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7265625238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.289459753036499, "sampling/importance_sampling_ratio/max": 1.9520225524902344, "sampling/importance_sampling_ratio/mean": 1.0000372052192688, "sampling/importance_sampling_ratio/min": 0.31350120902112816, "sampling/sampling_logp_difference/max": 6.1525186419487, "sampling/sampling_logp_difference/mean": 0.013847914896905423, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1468.6, "completions/max_terminated_length": 1468.6, "completions/mean_length": 1013.05625, "completions/mean_terminated_length": 1013.05625, "completions/min_length": 697.4, "completions/min_terminated_length": 697.4, "entropy": 0.2661381125450134, "epoch": 1.5746180963572267, "frac_reward_zero_std": 0.55, "grad_norm": 0.8375189900398254, "learning_rate": 3.532832566028592e-07, "loss": 0.0031, "num_tokens": 179052032.0, "reward": 0.8404687762260437, "reward_std": 0.09039057418704033, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8404687762260437, "rewards/e2e_recall_precision_mixed_reward/std": 0.2524672240018845, "sampling/importance_sampling_ratio/max": 1.9079517602920533, "sampling/importance_sampling_ratio/mean": 0.999993360042572, "sampling/importance_sampling_ratio/min": 0.3867530390620232, "sampling/sampling_logp_difference/max": 1.1689835071563721, "sampling/sampling_logp_difference/mean": 0.013322325237095357, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1527.4, "completions/max_terminated_length": 1341.0, "completions/mean_length": 1034.946875, "completions/mean_terminated_length": 1030.3750732421875, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "entropy": 0.261656728386879, "epoch": 1.5804935370152762, "frac_reward_zero_std": 0.5, "grad_norm": 0.13206477463245392, "learning_rate": 3.526774897019627e-07, "loss": -0.0105, "num_tokens": 179727691.0, "reward": 0.7455729246139526, "reward_std": 0.10084965825080872, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7455729246139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.23413763344287872, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999989628791809, "sampling/importance_sampling_ratio/min": 0.38248581886291505, "sampling/sampling_logp_difference/max": 1.209545373916626, "sampling/sampling_logp_difference/mean": 0.013246373273432255, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1276.8, "completions/max_terminated_length": 1276.8, "completions/mean_length": 938.6, "completions/mean_terminated_length": 938.6, "completions/min_length": 676.2, "completions/min_terminated_length": 676.2, "entropy": 0.2708220988512039, "epoch": 1.5863689776733256, "frac_reward_zero_std": 0.55, "grad_norm": 0.8071889877319336, "learning_rate": 3.520717228010661e-07, "loss": 0.0029, "num_tokens": 180349259.0, "reward": 0.7190104246139526, "reward_std": 0.09788908958435058, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7190104246139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.30650861859321593, "sampling/importance_sampling_ratio/max": 1.958139681816101, "sampling/importance_sampling_ratio/mean": 1.0000072836875915, "sampling/importance_sampling_ratio/min": 0.4504124343395233, "sampling/sampling_logp_difference/max": 0.8907589435577392, "sampling/sampling_logp_difference/mean": 0.013682788796722889, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1418.0, "completions/max_terminated_length": 1418.0, "completions/mean_length": 950.903125, "completions/mean_terminated_length": 950.903125, "completions/min_length": 681.2, "completions/min_terminated_length": 681.2, "entropy": 0.2506708770990372, "epoch": 1.5922444183313749, "frac_reward_zero_std": 0.6, "grad_norm": 0.7698414921760559, "learning_rate": 3.514659559001696e-07, "loss": 0.0022, "num_tokens": 180941676.0, "reward": 0.8494791746139526, "reward_std": 0.08073322921991348, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8494791746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.22139372527599335, "sampling/importance_sampling_ratio/max": 1.8772455215454102, "sampling/importance_sampling_ratio/mean": 1.0000471472740173, "sampling/importance_sampling_ratio/min": 0.28111872524023057, "sampling/sampling_logp_difference/max": 1.3952457189559937, "sampling/sampling_logp_difference/mean": 0.012653507106006146, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 1009.0875, "completions/mean_terminated_length": 1009.0875, "completions/min_length": 764.6, "completions/min_terminated_length": 764.6, "entropy": 0.258016636967659, "epoch": 1.598119858989424, "frac_reward_zero_std": 0.45, "grad_norm": 0.8082835674285889, "learning_rate": 3.5086018899927304e-07, "loss": 0.0038, "num_tokens": 181581192.0, "reward": 0.6520833492279052, "reward_std": 0.10948452875018119, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6520833492279052, "rewards/e2e_recall_precision_mixed_reward/std": 0.36065220832824707, "sampling/importance_sampling_ratio/max": 1.9668825149536133, "sampling/importance_sampling_ratio/mean": 0.999875009059906, "sampling/importance_sampling_ratio/min": 0.26854347884655, "sampling/sampling_logp_difference/max": 1.6615525960922242, "sampling/sampling_logp_difference/mean": 0.013133746571838856, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1402.0, "completions/max_terminated_length": 1402.0, "completions/mean_length": 964.73125, "completions/mean_terminated_length": 964.73125, "completions/min_length": 638.8, "completions/min_terminated_length": 638.8, "entropy": 0.27279475927352903, "epoch": 1.6039952996474736, "frac_reward_zero_std": 0.45, "grad_norm": 0.37393927574157715, "learning_rate": 3.5025442209837653e-07, "loss": -0.0099, "num_tokens": 182214178.0, "reward": 0.6559896111488343, "reward_std": 0.1184864416718483, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6559896111488343, "rewards/e2e_recall_precision_mixed_reward/std": 0.34022014737129214, "sampling/importance_sampling_ratio/max": 1.981991744041443, "sampling/importance_sampling_ratio/mean": 1.0000012278556825, "sampling/importance_sampling_ratio/min": 0.39412103295326234, "sampling/sampling_logp_difference/max": 0.9828832149505615, "sampling/sampling_logp_difference/mean": 0.013542711734771729, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1341.8, "completions/max_terminated_length": 1341.8, "completions/mean_length": 955.240625, "completions/mean_terminated_length": 955.240625, "completions/min_length": 721.2, "completions/min_terminated_length": 721.2, "entropy": 0.26667892932891846, "epoch": 1.609870740305523, "frac_reward_zero_std": 0.5, "grad_norm": 0.4775325059890747, "learning_rate": 3.4964865519748e-07, "loss": 0.0031, "num_tokens": 182863567.0, "reward": 0.6739583492279053, "reward_std": 0.10213126838207245, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6739583492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.4062064468860626, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000226378440857, "sampling/importance_sampling_ratio/min": 0.44252710342407225, "sampling/sampling_logp_difference/max": 1.2520951271057128, "sampling/sampling_logp_difference/mean": 0.013411963172256947, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1326.6, "completions/max_terminated_length": 1326.6, "completions/mean_length": 983.440625, "completions/mean_terminated_length": 983.440625, "completions/min_length": 761.8, "completions/min_terminated_length": 761.8, "entropy": 0.2573852360248566, "epoch": 1.6157461809635723, "frac_reward_zero_std": 0.3, "grad_norm": 0.726811945438385, "learning_rate": 3.4904288829658345e-07, "loss": -0.0038, "num_tokens": 183484732.0, "reward": 0.7614062547683715, "reward_std": 0.12323106527328491, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7614062547683715, "rewards/e2e_recall_precision_mixed_reward/std": 0.28980146944522855, "sampling/importance_sampling_ratio/max": 1.8720725297927856, "sampling/importance_sampling_ratio/mean": 1.0000526189804078, "sampling/importance_sampling_ratio/min": 0.4041356325149536, "sampling/sampling_logp_difference/max": 1.0532057285308838, "sampling/sampling_logp_difference/mean": 0.012885104678571225, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1204.4, "completions/max_terminated_length": 1204.4, "completions/mean_length": 883.18125, "completions/mean_terminated_length": 883.18125, "completions/min_length": 621.2, "completions/min_terminated_length": 621.2, "entropy": 0.26330329179763795, "epoch": 1.6216216216216215, "frac_reward_zero_std": 0.3, "grad_norm": 0.5486319065093994, "learning_rate": 3.4843712139568694e-07, "loss": -0.001, "num_tokens": 184108406.0, "reward": 0.6694270968437195, "reward_std": 0.14080710634589194, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6694271087646484, "rewards/e2e_recall_precision_mixed_reward/std": 0.3262661784887314, "sampling/importance_sampling_ratio/max": 1.9142040491104126, "sampling/importance_sampling_ratio/mean": 0.9999730348587036, "sampling/importance_sampling_ratio/min": 0.2835851192474365, "sampling/sampling_logp_difference/max": 1.327359104156494, "sampling/sampling_logp_difference/mean": 0.013574123941361904, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1382.6, "completions/max_terminated_length": 1382.6, "completions/mean_length": 973.8375, "completions/mean_terminated_length": 973.8375, "completions/min_length": 697.4, "completions/min_terminated_length": 697.4, "entropy": 0.25520346462726595, "epoch": 1.627497062279671, "frac_reward_zero_std": 0.4, "grad_norm": 0.6697659492492676, "learning_rate": 3.478313544947904e-07, "loss": 0.0007, "num_tokens": 184745202.0, "reward": 0.8057291984558106, "reward_std": 0.10847726836800575, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8057291984558106, "rewards/e2e_recall_precision_mixed_reward/std": 0.2511609926819801, "sampling/importance_sampling_ratio/max": 1.9274680614471436, "sampling/importance_sampling_ratio/mean": 0.9999136447906494, "sampling/importance_sampling_ratio/min": 0.3132738881278783, "sampling/sampling_logp_difference/max": 2.042408013343811, "sampling/sampling_logp_difference/mean": 0.013148021697998048, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1343.2, "completions/max_terminated_length": 1343.2, "completions/mean_length": 983.940625, "completions/mean_terminated_length": 983.940625, "completions/min_length": 770.4, "completions/min_terminated_length": 770.4, "entropy": 0.24816880524158477, "epoch": 1.6333725029377204, "frac_reward_zero_std": 0.65, "grad_norm": 0.8195874094963074, "learning_rate": 3.4722558759389387e-07, "loss": -0.0063, "num_tokens": 185400591.0, "reward": 0.7890625119209289, "reward_std": 0.05731135383248329, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7890625119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.274530765414238, "sampling/importance_sampling_ratio/max": 1.9616381645202636, "sampling/importance_sampling_ratio/mean": 0.9999887347221375, "sampling/importance_sampling_ratio/min": 0.32951097935438156, "sampling/sampling_logp_difference/max": 1.2189922094345094, "sampling/sampling_logp_difference/mean": 0.012603016383945943, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1579.6, "completions/max_terminated_length": 1579.6, "completions/mean_length": 1055.6875, "completions/mean_terminated_length": 1055.6875, "completions/min_length": 822.2, "completions/min_terminated_length": 822.2, "entropy": 0.280494225025177, "epoch": 1.6392479435957696, "frac_reward_zero_std": 0.35, "grad_norm": 0.8461142778396606, "learning_rate": 3.4661982069299736e-07, "loss": 0.0024, "num_tokens": 186054043.0, "reward": 0.6911458611488343, "reward_std": 0.13243722468614577, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6911458611488343, "rewards/e2e_recall_precision_mixed_reward/std": 0.30035166144371034, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000493407249451, "sampling/importance_sampling_ratio/min": 0.3128639668226242, "sampling/sampling_logp_difference/max": 1.3867409229278564, "sampling/sampling_logp_difference/mean": 0.013728627003729343, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1480.4, "completions/max_terminated_length": 1480.4, "completions/mean_length": 1067.021875, "completions/mean_terminated_length": 1067.021875, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "entropy": 0.26939132809638977, "epoch": 1.6451233842538189, "frac_reward_zero_std": 0.35, "grad_norm": 0.6929203271865845, "learning_rate": 3.460140537921008e-07, "loss": 0.0062, "num_tokens": 186696866.0, "reward": 0.7593750119209289, "reward_std": 0.14759771823883056, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7593750119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.3351838618516922, "sampling/importance_sampling_ratio/max": 1.9693204164505005, "sampling/importance_sampling_ratio/mean": 0.9999853372573853, "sampling/importance_sampling_ratio/min": 0.2552876703441143, "sampling/sampling_logp_difference/max": 1.6299549341201782, "sampling/sampling_logp_difference/mean": 0.013131172768771649, "step": 1400 }, { "epoch": 1.6451233842538189, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1447.16, "eval_completions/max_terminated_length": 1447.16, "eval_completions/mean_length": 1013.66375, "eval_completions/mean_terminated_length": 1013.66375, "eval_completions/min_length": 747.24, "eval_completions/min_terminated_length": 747.24, "eval_entropy": 0.27061537742614744, "eval_frac_reward_zero_std": 0.47, "eval_loss": 0.0014171568909659982, "eval_num_tokens": 186696866.0, "eval_reward": 0.7115208458900452, "eval_reward_std": 0.1059424777328968, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7115208458900452, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3199702352285385, "eval_runtime": 391.0468, "eval_samples_per_second": 0.256, "eval_sampling/importance_sampling_ratio/max": 1.9688294887542725, "eval_sampling/importance_sampling_ratio/mean": 0.9999658560752869, "eval_sampling/importance_sampling_ratio/min": 0.3690611620247364, "eval_sampling/sampling_logp_difference/max": 1.168926215171814, "eval_sampling/sampling_logp_difference/mean": 0.013416541777551173, "eval_steps_per_second": 0.005, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1724.6, "completions/max_terminated_length": 1675.6, "completions/mean_length": 1076.90625, "completions/mean_terminated_length": 1072.59599609375, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "entropy": 0.2762373864650726, "epoch": 1.6509988249118686, "frac_reward_zero_std": 0.45, "grad_norm": 0.559288501739502, "learning_rate": 3.454082868912043e-07, "loss": -0.0008, "num_tokens": 187377088.0, "reward": 0.6973958492279053, "reward_std": 0.11119076311588287, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6973958492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.3615538030862808, "sampling/importance_sampling_ratio/max": 1.9869262456893921, "sampling/importance_sampling_ratio/mean": 1.0000603914260864, "sampling/importance_sampling_ratio/min": 0.1473228994058445, "sampling/sampling_logp_difference/max": 2.740011477470398, "sampling/sampling_logp_difference/mean": 0.013979729451239108, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1452.4, "completions/max_terminated_length": 1452.4, "completions/mean_length": 1081.03125, "completions/mean_terminated_length": 1081.03125, "completions/min_length": 809.8, "completions/min_terminated_length": 809.8, "entropy": 0.263926637172699, "epoch": 1.6568742655699178, "frac_reward_zero_std": 0.5, "grad_norm": 0.8976117372512817, "learning_rate": 3.448025199903077e-07, "loss": 0.0007, "num_tokens": 188023354.0, "reward": 0.8164062619209289, "reward_std": 0.12308143377304077, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8164062619209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.23725470006465912, "sampling/importance_sampling_ratio/max": 1.9938726425170898, "sampling/importance_sampling_ratio/mean": 1.0001226782798767, "sampling/importance_sampling_ratio/min": 0.27841649786059863, "sampling/sampling_logp_difference/max": 2.92758526802063, "sampling/sampling_logp_difference/mean": 0.013116902112960816, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1446.6, "completions/max_terminated_length": 1446.6, "completions/mean_length": 1022.946875, "completions/mean_terminated_length": 1022.946875, "completions/min_length": 763.4, "completions/min_terminated_length": 763.4, "entropy": 0.27659066915512087, "epoch": 1.662749706227967, "frac_reward_zero_std": 0.2, "grad_norm": 0.9148181080818176, "learning_rate": 3.4419675308941116e-07, "loss": 0.0005, "num_tokens": 188676937.0, "reward": 0.8335937619209289, "reward_std": 0.1323336124420166, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8335937619209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.22035141587257384, "sampling/importance_sampling_ratio/max": 1.8846810340881348, "sampling/importance_sampling_ratio/mean": 0.9998690485954285, "sampling/importance_sampling_ratio/min": 0.3646425485610962, "sampling/sampling_logp_difference/max": 1.169146227836609, "sampling/sampling_logp_difference/mean": 0.013671478442847728, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1598.0, "completions/max_terminated_length": 1598.0, "completions/mean_length": 1139.496875, "completions/mean_terminated_length": 1139.496875, "completions/min_length": 878.8, "completions/min_terminated_length": 878.8, "entropy": 0.2779924929141998, "epoch": 1.6686251468860165, "frac_reward_zero_std": 0.4, "grad_norm": 0.4461043179035187, "learning_rate": 3.4359098618851465e-07, "loss": 0.0037, "num_tokens": 189358488.0, "reward": 0.6711979269981384, "reward_std": 0.13433899730443954, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6711979269981384, "rewards/e2e_recall_precision_mixed_reward/std": 0.3418775200843811, "sampling/importance_sampling_ratio/max": 1.8875983238220215, "sampling/importance_sampling_ratio/mean": 1.000086212158203, "sampling/importance_sampling_ratio/min": 0.3236319288611412, "sampling/sampling_logp_difference/max": 1.3859389901161194, "sampling/sampling_logp_difference/mean": 0.013647865317761899, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 1635.2, "completions/max_terminated_length": 1633.4, "completions/mean_length": 1141.5625, "completions/mean_terminated_length": 1133.90361328125, "completions/min_length": 852.8, "completions/min_terminated_length": 852.8, "entropy": 0.25371613204479215, "epoch": 1.674500587544066, "frac_reward_zero_std": 0.45, "grad_norm": 0.48061123490333557, "learning_rate": 3.429852192876181e-07, "loss": -0.0139, "num_tokens": 190017716.0, "reward": 0.85546875, "reward_std": 0.12050826102495193, "rewards/e2e_recall_precision_mixed_reward/mean": 0.85546875, "rewards/e2e_recall_precision_mixed_reward/std": 0.23054370284080505, "sampling/importance_sampling_ratio/max": 1.9753512620925904, "sampling/importance_sampling_ratio/mean": 1.0000061392784119, "sampling/importance_sampling_ratio/min": 0.33579882979393005, "sampling/sampling_logp_difference/max": 1.2614384889602661, "sampling/sampling_logp_difference/mean": 0.012662022560834884, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 2010.0, "completions/max_terminated_length": 1909.8, "completions/mean_length": 1150.165625, "completions/mean_terminated_length": 1141.9548828125, "completions/min_length": 837.6, "completions/min_terminated_length": 837.6, "entropy": 0.2775812327861786, "epoch": 1.6803760282021152, "frac_reward_zero_std": 0.25, "grad_norm": 0.7772521376609802, "learning_rate": 3.4237945238672157e-07, "loss": -0.0203, "num_tokens": 190713313.0, "reward": 0.7494791746139526, "reward_std": 0.17326337993144988, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7494791865348815, "rewards/e2e_recall_precision_mixed_reward/std": 0.3318782389163971, "sampling/importance_sampling_ratio/max": 1.9999818563461305, "sampling/importance_sampling_ratio/mean": 1.0000026941299438, "sampling/importance_sampling_ratio/min": 0.38341291844844816, "sampling/sampling_logp_difference/max": 1.1000696659088134, "sampling/sampling_logp_difference/mean": 0.013829389959573746, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1708.8, "completions/max_terminated_length": 1708.8, "completions/mean_length": 1123.74375, "completions/mean_terminated_length": 1123.74375, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "entropy": 0.2673244297504425, "epoch": 1.6862514688601644, "frac_reward_zero_std": 0.4, "grad_norm": 0.6763787865638733, "learning_rate": 3.41773685485825e-07, "loss": -0.0027, "num_tokens": 191396479.0, "reward": 0.6844791769981384, "reward_std": 0.14983025342226028, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6844791769981384, "rewards/e2e_recall_precision_mixed_reward/std": 0.3457943320274353, "sampling/importance_sampling_ratio/max": 1.9644380331039428, "sampling/importance_sampling_ratio/mean": 1.0000961780548097, "sampling/importance_sampling_ratio/min": 0.3317388445138931, "sampling/sampling_logp_difference/max": 1.1634628534317017, "sampling/sampling_logp_difference/mean": 0.01333068311214447, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1594.6, "completions/max_terminated_length": 1594.6, "completions/mean_length": 1104.1125, "completions/mean_terminated_length": 1104.1125, "completions/min_length": 752.0, "completions/min_terminated_length": 752.0, "entropy": 0.2846644163131714, "epoch": 1.6921269095182139, "frac_reward_zero_std": 0.2, "grad_norm": 0.842131495475769, "learning_rate": 3.411679185849285e-07, "loss": 0.0031, "num_tokens": 192105251.0, "reward": 0.79411461353302, "reward_std": 0.18594035059213637, "rewards/e2e_recall_precision_mixed_reward/mean": 0.79411461353302, "rewards/e2e_recall_precision_mixed_reward/std": 0.29445070028305054, "sampling/importance_sampling_ratio/max": 1.9737318515777589, "sampling/importance_sampling_ratio/mean": 1.000055193901062, "sampling/importance_sampling_ratio/min": 0.3524580836296082, "sampling/sampling_logp_difference/max": 1.214442205429077, "sampling/sampling_logp_difference/mean": 0.01409766599535942, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1492.6, "completions/max_terminated_length": 1492.6, "completions/mean_length": 1088.29375, "completions/mean_terminated_length": 1088.29375, "completions/min_length": 832.8, "completions/min_terminated_length": 832.8, "entropy": 0.24627983570098877, "epoch": 1.6980023501762633, "frac_reward_zero_std": 0.45, "grad_norm": 0.5946608781814575, "learning_rate": 3.40562151684032e-07, "loss": 0.0004, "num_tokens": 192769057.0, "reward": 0.8041666984558106, "reward_std": 0.13151083439588546, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8041666984558106, "rewards/e2e_recall_precision_mixed_reward/std": 0.26574690639972687, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000511169433595, "sampling/importance_sampling_ratio/min": 0.3163315311074257, "sampling/sampling_logp_difference/max": 1.519883394241333, "sampling/sampling_logp_difference/mean": 0.012410031445324422, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1407.6, "completions/max_terminated_length": 1407.6, "completions/mean_length": 1042.9375, "completions/mean_terminated_length": 1042.9375, "completions/min_length": 769.2, "completions/min_terminated_length": 769.2, "entropy": 0.2711980938911438, "epoch": 1.7038777908343126, "frac_reward_zero_std": 0.35, "grad_norm": 0.9581571817398071, "learning_rate": 3.399563847831354e-07, "loss": 0.0065, "num_tokens": 193391677.0, "reward": 0.8053646087646484, "reward_std": 0.10521206557750702, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8053646087646484, "rewards/e2e_recall_precision_mixed_reward/std": 0.24722022712230682, "sampling/importance_sampling_ratio/max": 1.9165439844131469, "sampling/importance_sampling_ratio/mean": 0.9998835325241089, "sampling/importance_sampling_ratio/min": 0.35185267627239225, "sampling/sampling_logp_difference/max": 1.135908579826355, "sampling/sampling_logp_difference/mean": 0.013463702611625194, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1591.8, "completions/max_terminated_length": 1591.8, "completions/mean_length": 1123.79375, "completions/mean_terminated_length": 1123.79375, "completions/min_length": 847.2, "completions/min_terminated_length": 847.2, "entropy": 0.27680361866950987, "epoch": 1.7097532314923618, "frac_reward_zero_std": 0.4, "grad_norm": 0.9067860841751099, "learning_rate": 3.393506178822389e-07, "loss": -0.0007, "num_tokens": 194075291.0, "reward": 0.7929687738418579, "reward_std": 0.1173098023980856, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7929687738418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.27822592854499817, "sampling/importance_sampling_ratio/max": 1.997536540031433, "sampling/importance_sampling_ratio/mean": 0.9999045729637146, "sampling/importance_sampling_ratio/min": 0.24495663307607174, "sampling/sampling_logp_difference/max": 1.8837570667266845, "sampling/sampling_logp_difference/mean": 0.013546660356223584, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1802.6, "completions/max_terminated_length": 1802.6, "completions/mean_length": 1195.10625, "completions/mean_terminated_length": 1195.10625, "completions/min_length": 867.6, "completions/min_terminated_length": 867.6, "entropy": 0.2939670443534851, "epoch": 1.7156286721504113, "frac_reward_zero_std": 0.4, "grad_norm": 0.8876379728317261, "learning_rate": 3.3874485098134235e-07, "loss": 0.0023, "num_tokens": 194772253.0, "reward": 0.8272916793823242, "reward_std": 0.1560825377702713, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8272916913032532, "rewards/e2e_recall_precision_mixed_reward/std": 0.2652295768260956, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000036716461183, "sampling/importance_sampling_ratio/min": 0.34000002443790434, "sampling/sampling_logp_difference/max": 1.2622709274291992, "sampling/sampling_logp_difference/mean": 0.014317047223448754, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 1873.4, "completions/max_terminated_length": 1588.2, "completions/mean_length": 1152.790625, "completions/mean_terminated_length": 1144.909619140625, "completions/min_length": 763.8, "completions/min_terminated_length": 763.8, "entropy": 0.2873714804649353, "epoch": 1.7215041128084607, "frac_reward_zero_std": 0.4, "grad_norm": 0.39559435844421387, "learning_rate": 3.3813908408044584e-07, "loss": -0.0288, "num_tokens": 195478178.0, "reward": 0.8166666984558105, "reward_std": 0.11755480468273163, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8166666984558105, "rewards/e2e_recall_precision_mixed_reward/std": 0.23863519877195358, "sampling/importance_sampling_ratio/max": 1.9812901973724366, "sampling/importance_sampling_ratio/mean": 0.9998369455337525, "sampling/importance_sampling_ratio/min": 0.3511409223079681, "sampling/sampling_logp_difference/max": 1.1258719205856322, "sampling/sampling_logp_difference/mean": 0.014134268276393414, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 1786.4, "completions/max_terminated_length": 1709.2, "completions/mean_length": 1069.403125, "completions/mean_terminated_length": 1060.7073486328125, "completions/min_length": 754.4, "completions/min_terminated_length": 754.4, "entropy": 0.26078082621097565, "epoch": 1.72737955346651, "frac_reward_zero_std": 0.55, "grad_norm": 0.7395367622375488, "learning_rate": 3.3753331717954933e-07, "loss": -0.0151, "num_tokens": 196111723.0, "reward": 0.8067708373069763, "reward_std": 0.09850462116301059, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8067708373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.2902964472770691, "sampling/importance_sampling_ratio/max": 1.9457304000854492, "sampling/importance_sampling_ratio/mean": 0.9999278068542481, "sampling/importance_sampling_ratio/min": 0.3522989869117737, "sampling/sampling_logp_difference/max": 1.0490608930587768, "sampling/sampling_logp_difference/mean": 0.013238179869949818, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1820.2, "completions/max_terminated_length": 1820.2, "completions/mean_length": 1133.53125, "completions/mean_terminated_length": 1133.53125, "completions/min_length": 744.6, "completions/min_terminated_length": 744.6, "entropy": 0.2913938283920288, "epoch": 1.7332549941245592, "frac_reward_zero_std": 0.45, "grad_norm": 0.725741446018219, "learning_rate": 3.3692755027865276e-07, "loss": -0.0034, "num_tokens": 196785365.0, "reward": 0.8822916865348815, "reward_std": 0.13487583696842192, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8822916865348815, "rewards/e2e_recall_precision_mixed_reward/std": 0.22184424996376037, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001361846923829, "sampling/importance_sampling_ratio/min": 0.23281437605627903, "sampling/sampling_logp_difference/max": 6.875998139381409, "sampling/sampling_logp_difference/mean": 0.014337152801454067, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1587.2, "completions/max_terminated_length": 1587.2, "completions/mean_length": 1084.86875, "completions/mean_terminated_length": 1084.86875, "completions/min_length": 740.6, "completions/min_terminated_length": 740.6, "entropy": 0.2729551553726196, "epoch": 1.7391304347826086, "frac_reward_zero_std": 0.55, "grad_norm": 0.8482347726821899, "learning_rate": 3.3632178337775625e-07, "loss": 0.0076, "num_tokens": 197459147.0, "reward": 0.7274479329586029, "reward_std": 0.0780523905530572, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7274479389190673, "rewards/e2e_recall_precision_mixed_reward/std": 0.2432739406824112, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000293016433717, "sampling/importance_sampling_ratio/min": 0.37133134007453916, "sampling/sampling_logp_difference/max": 1.160807228088379, "sampling/sampling_logp_difference/mean": 0.01364643257111311, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1518.4, "completions/max_terminated_length": 1518.4, "completions/mean_length": 1082.0125, "completions/mean_terminated_length": 1082.0125, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "entropy": 0.2662626028060913, "epoch": 1.745005875440658, "frac_reward_zero_std": 0.4, "grad_norm": 0.763447642326355, "learning_rate": 3.357160164768597e-07, "loss": 0.0033, "num_tokens": 198135823.0, "reward": 0.8534896016120911, "reward_std": 0.11715929210186005, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8534896016120911, "rewards/e2e_recall_precision_mixed_reward/std": 0.20295286029577256, "sampling/importance_sampling_ratio/max": 1.937989926338196, "sampling/importance_sampling_ratio/mean": 0.9998504519462585, "sampling/importance_sampling_ratio/min": 0.3984066128730774, "sampling/sampling_logp_difference/max": 1.212566328048706, "sampling/sampling_logp_difference/mean": 0.01346975788474083, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1595.2, "completions/max_terminated_length": 1595.2, "completions/mean_length": 1146.109375, "completions/mean_terminated_length": 1146.109375, "completions/min_length": 805.4, "completions/min_terminated_length": 805.4, "entropy": 0.2847128093242645, "epoch": 1.7508813160987073, "frac_reward_zero_std": 0.45, "grad_norm": 0.5143160223960876, "learning_rate": 3.351102495759632e-07, "loss": 0.0047, "num_tokens": 198828690.0, "reward": 0.6278125166893005, "reward_std": 0.0884520411491394, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6278125166893005, "rewards/e2e_recall_precision_mixed_reward/std": 0.3630589783191681, "sampling/importance_sampling_ratio/max": 1.9360803604125976, "sampling/importance_sampling_ratio/mean": 1.000067901611328, "sampling/importance_sampling_ratio/min": 0.2725113719701767, "sampling/sampling_logp_difference/max": 1.3496559381484985, "sampling/sampling_logp_difference/mean": 0.014137699641287327, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1509.8, "completions/max_terminated_length": 1509.8, "completions/mean_length": 1099.55, "completions/mean_terminated_length": 1099.55, "completions/min_length": 817.4, "completions/min_terminated_length": 817.4, "entropy": 0.2684398263692856, "epoch": 1.7567567567567568, "frac_reward_zero_std": 0.5, "grad_norm": 0.6384820938110352, "learning_rate": 3.3450448267506667e-07, "loss": -0.003, "num_tokens": 199510882.0, "reward": 0.8046875119209289, "reward_std": 0.08579807132482528, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8046875119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.23521889597177506, "sampling/importance_sampling_ratio/max": 1.968173861503601, "sampling/importance_sampling_ratio/mean": 0.9999576926231384, "sampling/importance_sampling_ratio/min": 0.41731377840042116, "sampling/sampling_logp_difference/max": 1.2025867462158204, "sampling/sampling_logp_difference/mean": 0.013431616872549058, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1570.4, "completions/max_terminated_length": 1570.4, "completions/mean_length": 1124.840625, "completions/mean_terminated_length": 1124.840625, "completions/min_length": 745.4, "completions/min_terminated_length": 745.4, "entropy": 0.2712507307529449, "epoch": 1.7626321974148063, "frac_reward_zero_std": 0.3, "grad_norm": 0.7118411064147949, "learning_rate": 3.3389871577417005e-07, "loss": 0.0033, "num_tokens": 200198207.0, "reward": 0.8050000071525574, "reward_std": 0.12036772668361664, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8050000071525574, "rewards/e2e_recall_precision_mixed_reward/std": 0.27572363168001174, "sampling/importance_sampling_ratio/max": 1.9563879728317262, "sampling/importance_sampling_ratio/mean": 1.0000667214393615, "sampling/importance_sampling_ratio/min": 0.44692354202270507, "sampling/sampling_logp_difference/max": 0.8758127689361572, "sampling/sampling_logp_difference/mean": 0.013625680841505528, "step": 1500 }, { "epoch": 1.7626321974148063, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1536.84, "eval_completions/max_terminated_length": 1536.84, "eval_completions/mean_length": 1071.7275, "eval_completions/mean_terminated_length": 1071.7275, "eval_completions/min_length": 773.0, "eval_completions/min_terminated_length": 773.0, "eval_entropy": 0.27730977356433867, "eval_frac_reward_zero_std": 0.58, "eval_loss": 0.0037606186233460903, "eval_num_tokens": 200198207.0, "eval_reward": 0.7363020944595337, "eval_reward_std": 0.09190770551562309, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7363020944595337, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3109273624420166, "eval_runtime": 415.8157, "eval_samples_per_second": 0.24, "eval_sampling/importance_sampling_ratio/max": 1.9509496116638183, "eval_sampling/importance_sampling_ratio/mean": 0.9999890184402466, "eval_sampling/importance_sampling_ratio/min": 0.34508894979953764, "eval_sampling/sampling_logp_difference/max": 1.3034389925003051, "eval_sampling/sampling_logp_difference/mean": 0.013880596235394478, "eval_steps_per_second": 0.005, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1645.2, "completions/max_terminated_length": 1645.2, "completions/mean_length": 1086.928125, "completions/mean_terminated_length": 1086.928125, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "entropy": 0.2728815793991089, "epoch": 1.7685076380728555, "frac_reward_zero_std": 0.4, "grad_norm": 0.8437840938568115, "learning_rate": 3.3329294887327354e-07, "loss": 0.0099, "num_tokens": 200872440.0, "reward": 0.8072916805744171, "reward_std": 0.12046882957220077, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8072916805744171, "rewards/e2e_recall_precision_mixed_reward/std": 0.2373180866241455, "sampling/importance_sampling_ratio/max": 1.99754478931427, "sampling/importance_sampling_ratio/mean": 1.0000806212425233, "sampling/importance_sampling_ratio/min": 0.26217866539955137, "sampling/sampling_logp_difference/max": 1.6371884107589723, "sampling/sampling_logp_difference/mean": 0.013733114674687385, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1595.6, "completions/max_terminated_length": 1595.6, "completions/mean_length": 1065.984375, "completions/mean_terminated_length": 1065.984375, "completions/min_length": 792.4, "completions/min_terminated_length": 792.4, "entropy": 0.26055130958557127, "epoch": 1.7743830787309047, "frac_reward_zero_std": 0.6, "grad_norm": 0.0, "learning_rate": 3.32687181972377e-07, "loss": 0.0001, "num_tokens": 201514851.0, "reward": 0.7830729246139526, "reward_std": 0.0687948226928711, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7830729365348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.2838017612695694, "sampling/importance_sampling_ratio/max": 1.9749235153198241, "sampling/importance_sampling_ratio/mean": 1.0000560760498047, "sampling/importance_sampling_ratio/min": 0.297700959444046, "sampling/sampling_logp_difference/max": 1.359819483757019, "sampling/sampling_logp_difference/mean": 0.01325578261166811, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1431.0, "completions/max_terminated_length": 1431.0, "completions/mean_length": 1057.603125, "completions/mean_terminated_length": 1057.603125, "completions/min_length": 773.6, "completions/min_terminated_length": 773.6, "entropy": 0.255610191822052, "epoch": 1.7802585193889542, "frac_reward_zero_std": 0.4, "grad_norm": 0.430332213640213, "learning_rate": 3.3208141507148047e-07, "loss": 0.0011, "num_tokens": 202164196.0, "reward": 0.7718750238418579, "reward_std": 0.16177449077367784, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7718750238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.29015558063983915, "sampling/importance_sampling_ratio/max": 1.9674141883850098, "sampling/importance_sampling_ratio/mean": 0.9999878883361817, "sampling/importance_sampling_ratio/min": 0.3030325770378113, "sampling/sampling_logp_difference/max": 1.3088083505630492, "sampling/sampling_logp_difference/mean": 0.01293297652155161, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1647.6, "completions/max_terminated_length": 1647.6, "completions/mean_length": 1092.4125, "completions/mean_terminated_length": 1092.4125, "completions/min_length": 760.8, "completions/min_terminated_length": 760.8, "entropy": 0.2662869393825531, "epoch": 1.7861339600470036, "frac_reward_zero_std": 0.45, "grad_norm": 2.165395736694336, "learning_rate": 3.3147564817058396e-07, "loss": -0.0056, "num_tokens": 202817144.0, "reward": 0.6911458492279052, "reward_std": 0.11217592209577561, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6911458492279052, "rewards/e2e_recall_precision_mixed_reward/std": 0.3047403275966644, "sampling/importance_sampling_ratio/max": 1.9844163179397583, "sampling/importance_sampling_ratio/mean": 1.0000060439109801, "sampling/importance_sampling_ratio/min": 0.40356804728507994, "sampling/sampling_logp_difference/max": 0.9722619771957397, "sampling/sampling_logp_difference/mean": 0.013397721946239472, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 1852.6, "completions/max_terminated_length": 1848.6, "completions/mean_length": 1120.71875, "completions/mean_terminated_length": 1104.1087646484375, "completions/min_length": 716.8, "completions/min_terminated_length": 716.8, "entropy": 0.27589981257915497, "epoch": 1.7920094007050529, "frac_reward_zero_std": 0.35, "grad_norm": 0.6116582155227661, "learning_rate": 3.308698812696874e-07, "loss": -0.0213, "num_tokens": 203493166.0, "reward": 0.6966145992279053, "reward_std": 0.13366687893867493, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6966145992279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.32275949120521547, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000032782554626, "sampling/importance_sampling_ratio/min": 0.3299687564373016, "sampling/sampling_logp_difference/max": 1.2879135131835937, "sampling/sampling_logp_difference/mean": 0.013806664571166038, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1564.8, "completions/max_terminated_length": 1564.8, "completions/mean_length": 1040.4125, "completions/mean_terminated_length": 1040.4125, "completions/min_length": 718.4, "completions/min_terminated_length": 718.4, "entropy": 0.25432821810245515, "epoch": 1.7978848413631021, "frac_reward_zero_std": 0.4, "grad_norm": 0.621759831905365, "learning_rate": 3.302641143687909e-07, "loss": -0.0092, "num_tokens": 204126866.0, "reward": 0.806458342075348, "reward_std": 0.11663768589496612, "rewards/e2e_recall_precision_mixed_reward/mean": 0.806458342075348, "rewards/e2e_recall_precision_mixed_reward/std": 0.2767653793096542, "sampling/importance_sampling_ratio/max": 1.8235158443450927, "sampling/importance_sampling_ratio/mean": 0.9999896287918091, "sampling/importance_sampling_ratio/min": 0.4039942383766174, "sampling/sampling_logp_difference/max": 0.9295182943344116, "sampling/sampling_logp_difference/mean": 0.012854778952896596, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1861.2, "completions/max_terminated_length": 1861.2, "completions/mean_length": 1086.84375, "completions/mean_terminated_length": 1086.84375, "completions/min_length": 759.8, "completions/min_terminated_length": 759.8, "entropy": 0.25863939225673677, "epoch": 1.8037602820211516, "frac_reward_zero_std": 0.3, "grad_norm": 1.2077988386154175, "learning_rate": 3.296583474678943e-07, "loss": -0.0134, "num_tokens": 204804720.0, "reward": 0.7979166746139527, "reward_std": 0.17481858432292938, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7979166865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.28416181802749635, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999544143676757, "sampling/importance_sampling_ratio/min": 0.38569867610931396, "sampling/sampling_logp_difference/max": 0.9860574126243591, "sampling/sampling_logp_difference/mean": 0.013360159657895564, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 1092.853125, "completions/mean_terminated_length": 1092.853125, "completions/min_length": 817.0, "completions/min_terminated_length": 817.0, "entropy": 0.262846040725708, "epoch": 1.809635722679201, "frac_reward_zero_std": 0.5, "grad_norm": 0.6944238543510437, "learning_rate": 3.290525805669978e-07, "loss": 0.0018, "num_tokens": 205475425.0, "reward": 0.7393229246139527, "reward_std": 0.08657962083816528, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7393229246139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.2897666200995445, "sampling/importance_sampling_ratio/max": 1.9605785608291626, "sampling/importance_sampling_ratio/mean": 1.0000004768371582, "sampling/importance_sampling_ratio/min": 0.39530388712882997, "sampling/sampling_logp_difference/max": 0.9769327878952027, "sampling/sampling_logp_difference/mean": 0.013001962006092072, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1548.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 1128.890625, "completions/mean_terminated_length": 1128.890625, "completions/min_length": 838.0, "completions/min_terminated_length": 838.0, "entropy": 0.27651565670967104, "epoch": 1.8155111633372503, "frac_reward_zero_std": 0.45, "grad_norm": 0.8144307732582092, "learning_rate": 3.284468136661013e-07, "loss": 0.0076, "num_tokens": 206159118.0, "reward": 0.7991666793823242, "reward_std": 0.08892802894115448, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7991666793823242, "rewards/e2e_recall_precision_mixed_reward/std": 0.2119009464979172, "sampling/importance_sampling_ratio/max": 1.920351767539978, "sampling/importance_sampling_ratio/mean": 1.0000345706939697, "sampling/importance_sampling_ratio/min": 0.35048373639583585, "sampling/sampling_logp_difference/max": 1.0617512702941894, "sampling/sampling_logp_difference/mean": 0.013503380306065082, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1675.0, "completions/max_terminated_length": 1675.0, "completions/mean_length": 1168.159375, "completions/mean_terminated_length": 1168.159375, "completions/min_length": 803.2, "completions/min_terminated_length": 803.2, "entropy": 0.2956337988376617, "epoch": 1.8213866039952995, "frac_reward_zero_std": 0.55, "grad_norm": 0.5562376976013184, "learning_rate": 3.2784104676520473e-07, "loss": -0.0029, "num_tokens": 206846977.0, "reward": 0.7964062571525574, "reward_std": 0.07668278813362121, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7964062690734863, "rewards/e2e_recall_precision_mixed_reward/std": 0.3294161468744278, "sampling/importance_sampling_ratio/max": 1.8290458917617798, "sampling/importance_sampling_ratio/mean": 1.000098967552185, "sampling/importance_sampling_ratio/min": 0.3671499669551849, "sampling/sampling_logp_difference/max": 1.039049458503723, "sampling/sampling_logp_difference/mean": 0.014282687194645404, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1794.4, "completions/max_terminated_length": 1794.4, "completions/mean_length": 1118.421875, "completions/mean_terminated_length": 1118.421875, "completions/min_length": 783.4, "completions/min_terminated_length": 783.4, "entropy": 0.2976920962333679, "epoch": 1.827262044653349, "frac_reward_zero_std": 0.5, "grad_norm": 0.7856850624084473, "learning_rate": 3.272352798643082e-07, "loss": 0.0094, "num_tokens": 207543032.0, "reward": 0.651562511920929, "reward_std": 0.12044147849082946, "rewards/e2e_recall_precision_mixed_reward/mean": 0.651562511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.36984447836875917, "sampling/importance_sampling_ratio/max": 1.9146425247192382, "sampling/importance_sampling_ratio/mean": 0.9998530745506287, "sampling/importance_sampling_ratio/min": 0.3384554922580719, "sampling/sampling_logp_difference/max": 1.096525502204895, "sampling/sampling_logp_difference/mean": 0.014470845647156238, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1517.4, "completions/max_terminated_length": 1517.4, "completions/mean_length": 1085.29375, "completions/mean_terminated_length": 1085.29375, "completions/min_length": 749.2, "completions/min_terminated_length": 749.2, "entropy": 0.27524412870407106, "epoch": 1.8331374853113984, "frac_reward_zero_std": 0.4, "grad_norm": 0.7174109816551208, "learning_rate": 3.2662951296341166e-07, "loss": 0.0095, "num_tokens": 208243862.0, "reward": 0.7854166746139526, "reward_std": 0.11581210866570472, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7854166746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.30429266691207885, "sampling/importance_sampling_ratio/max": 1.9975382328033446, "sampling/importance_sampling_ratio/mean": 0.9999500274658203, "sampling/importance_sampling_ratio/min": 0.3587868869304657, "sampling/sampling_logp_difference/max": 1.1987864136695863, "sampling/sampling_logp_difference/mean": 0.013580608554184437, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1499.4, "completions/max_terminated_length": 1499.4, "completions/mean_length": 1119.565625, "completions/mean_terminated_length": 1119.565625, "completions/min_length": 858.6, "completions/min_terminated_length": 858.6, "entropy": 0.2701830804347992, "epoch": 1.8390129259694477, "frac_reward_zero_std": 0.45, "grad_norm": 0.8073204159736633, "learning_rate": 3.2602374606251515e-07, "loss": 0.0011, "num_tokens": 208959259.0, "reward": 0.8133854150772095, "reward_std": 0.11158336699008942, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8133854150772095, "rewards/e2e_recall_precision_mixed_reward/std": 0.2828989580273628, "sampling/importance_sampling_ratio/max": 1.943230438232422, "sampling/importance_sampling_ratio/mean": 0.999999463558197, "sampling/importance_sampling_ratio/min": 0.2029788501560688, "sampling/sampling_logp_difference/max": 1.7986388444900512, "sampling/sampling_logp_difference/mean": 0.013567885570228099, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1445.0, "completions/max_terminated_length": 1445.0, "completions/mean_length": 1044.534375, "completions/mean_terminated_length": 1044.534375, "completions/min_length": 792.6, "completions/min_terminated_length": 792.6, "entropy": 0.27079087495803833, "epoch": 1.8448883666274971, "frac_reward_zero_std": 0.6, "grad_norm": 0.0, "learning_rate": 3.2541797916161864e-07, "loss": 0.0025, "num_tokens": 209613382.0, "reward": 0.810937511920929, "reward_std": 0.06994951367378235, "rewards/e2e_recall_precision_mixed_reward/mean": 0.810937511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.244219571352005, "sampling/importance_sampling_ratio/max": 1.8565600395202637, "sampling/importance_sampling_ratio/mean": 1.0000513553619386, "sampling/importance_sampling_ratio/min": 0.34990236461162566, "sampling/sampling_logp_difference/max": 1.088843870162964, "sampling/sampling_logp_difference/mean": 0.013324829936027526, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1507.4, "completions/max_terminated_length": 1507.4, "completions/mean_length": 1063.225, "completions/mean_terminated_length": 1063.225, "completions/min_length": 785.8, "completions/min_terminated_length": 785.8, "entropy": 0.26387048363685606, "epoch": 1.8507638072855466, "frac_reward_zero_std": 0.55, "grad_norm": 0.773414671421051, "learning_rate": 3.2481221226072207e-07, "loss": -0.0025, "num_tokens": 210242766.0, "reward": 0.934375, "reward_std": 0.09152774214744568, "rewards/e2e_recall_precision_mixed_reward/mean": 0.934375, "rewards/e2e_recall_precision_mixed_reward/std": 0.13311923742294313, "sampling/importance_sampling_ratio/max": 1.9508730173110962, "sampling/importance_sampling_ratio/mean": 0.9999862909317017, "sampling/importance_sampling_ratio/min": 0.3466633170843124, "sampling/sampling_logp_difference/max": 1.1064417600631713, "sampling/sampling_logp_difference/mean": 0.01319211684167385, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1640.4, "completions/max_terminated_length": 1640.4, "completions/mean_length": 1128.60625, "completions/mean_terminated_length": 1128.60625, "completions/min_length": 818.8, "completions/min_terminated_length": 818.8, "entropy": 0.2660370707511902, "epoch": 1.8566392479435958, "frac_reward_zero_std": 0.3, "grad_norm": 0.6501742005348206, "learning_rate": 3.242064453598255e-07, "loss": 0.0035, "num_tokens": 210934624.0, "reward": 0.6548958420753479, "reward_std": 0.12596461772918702, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6548958420753479, "rewards/e2e_recall_precision_mixed_reward/std": 0.2918025851249695, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999019145965576, "sampling/importance_sampling_ratio/min": 0.2749210774898529, "sampling/sampling_logp_difference/max": 1.399231457710266, "sampling/sampling_logp_difference/mean": 0.01334780901670456, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1491.4, "completions/max_terminated_length": 1491.4, "completions/mean_length": 1057.3125, "completions/mean_terminated_length": 1057.3125, "completions/min_length": 761.6, "completions/min_terminated_length": 761.6, "entropy": 0.262117275595665, "epoch": 1.862514688601645, "frac_reward_zero_std": 0.45, "grad_norm": 2.2436435222625732, "learning_rate": 3.2360067845892895e-07, "loss": 0.0087, "num_tokens": 211605716.0, "reward": 0.7901041805744171, "reward_std": 0.10960776507854461, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7901041805744171, "rewards/e2e_recall_precision_mixed_reward/std": 0.2129432439804077, "sampling/importance_sampling_ratio/max": 1.9343363523483277, "sampling/importance_sampling_ratio/mean": 0.9999404549598694, "sampling/importance_sampling_ratio/min": 0.35822451710700987, "sampling/sampling_logp_difference/max": 1.0487784624099732, "sampling/sampling_logp_difference/mean": 0.013386520184576511, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1622.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 1065.365625, "completions/mean_terminated_length": 1065.365625, "completions/min_length": 718.6, "completions/min_terminated_length": 718.6, "entropy": 0.27746407985687255, "epoch": 1.8683901292596945, "frac_reward_zero_std": 0.55, "grad_norm": 0.5856614112854004, "learning_rate": 3.2299491155803244e-07, "loss": 0.004, "num_tokens": 212305273.0, "reward": 0.7411458492279053, "reward_std": 0.084406515955925, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7411458492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.2793776974081993, "sampling/importance_sampling_ratio/max": 1.961573100090027, "sampling/importance_sampling_ratio/mean": 0.999928104877472, "sampling/importance_sampling_ratio/min": 0.40686691403388975, "sampling/sampling_logp_difference/max": 1.0171378612518311, "sampling/sampling_logp_difference/mean": 0.013991770520806312, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1656.6, "completions/max_terminated_length": 1656.6, "completions/mean_length": 1115.678125, "completions/mean_terminated_length": 1115.678125, "completions/min_length": 834.0, "completions/min_terminated_length": 834.0, "entropy": 0.24829670786857605, "epoch": 1.874265569917744, "frac_reward_zero_std": 0.5, "grad_norm": 0.8266233205795288, "learning_rate": 3.223891446571359e-07, "loss": 0.0043, "num_tokens": 212961378.0, "reward": 0.8059895873069763, "reward_std": 0.10123835355043412, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8059895873069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.24552496075630187, "sampling/importance_sampling_ratio/max": 1.7967779159545898, "sampling/importance_sampling_ratio/mean": 0.9999753475189209, "sampling/importance_sampling_ratio/min": 0.30345211625099183, "sampling/sampling_logp_difference/max": 1.3853749752044677, "sampling/sampling_logp_difference/mean": 0.012761880829930305, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1525.8, "completions/max_terminated_length": 1525.8, "completions/mean_length": 1050.45, "completions/mean_terminated_length": 1050.45, "completions/min_length": 756.4, "completions/min_terminated_length": 756.4, "entropy": 0.25351338386535643, "epoch": 1.8801410105757932, "frac_reward_zero_std": 0.6, "grad_norm": 0.5988499522209167, "learning_rate": 3.2178337775623936e-07, "loss": 0.0034, "num_tokens": 213594482.0, "reward": 0.9263020992279053, "reward_std": 0.06761249005794526, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9263020992279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.11542957425117492, "sampling/importance_sampling_ratio/max": 1.9589691400527953, "sampling/importance_sampling_ratio/mean": 1.0000545501708984, "sampling/importance_sampling_ratio/min": 0.34012679755687714, "sampling/sampling_logp_difference/max": 1.2352878332138062, "sampling/sampling_logp_difference/mean": 0.012775789014995098, "step": 1600 }, { "epoch": 1.8801410105757932, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1435.84, "eval_completions/max_terminated_length": 1435.84, "eval_completions/mean_length": 1044.866875, "eval_completions/mean_terminated_length": 1044.866875, "eval_completions/min_length": 772.36, "eval_completions/min_terminated_length": 772.36, "eval_entropy": 0.2674348741769791, "eval_frac_reward_zero_std": 0.53, "eval_loss": -0.001374118379317224, "eval_num_tokens": 213594482.0, "eval_reward": 0.7270000171661377, "eval_reward_std": 0.09518681436777116, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7270000171661377, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.31534139573574066, "eval_runtime": 396.0164, "eval_samples_per_second": 0.253, "eval_sampling/importance_sampling_ratio/max": 1.9521161270141603, "eval_sampling/importance_sampling_ratio/mean": 0.9999828863143921, "eval_sampling/importance_sampling_ratio/min": 0.37689221899025144, "eval_sampling/sampling_logp_difference/max": 1.2735300946235657, "eval_sampling/sampling_logp_difference/mean": 0.013533079214394093, "eval_steps_per_second": 0.005, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1400.6, "completions/max_terminated_length": 1400.6, "completions/mean_length": 1069.303125, "completions/mean_terminated_length": 1069.303125, "completions/min_length": 842.2, "completions/min_terminated_length": 842.2, "entropy": 0.24568533301353454, "epoch": 1.8860164512338424, "frac_reward_zero_std": 0.45, "grad_norm": 0.45789584517478943, "learning_rate": 3.2117761085534285e-07, "loss": 0.0016, "num_tokens": 214247363.0, "reward": 0.7768229246139526, "reward_std": 0.08649568557739258, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7768229246139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.23062911927700042, "sampling/importance_sampling_ratio/max": 1.925460934638977, "sampling/importance_sampling_ratio/mean": 1.0000849604606628, "sampling/importance_sampling_ratio/min": 0.31908697783946993, "sampling/sampling_logp_difference/max": 1.8175645828247071, "sampling/sampling_logp_difference/mean": 0.012399931252002717, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1458.8, "completions/max_terminated_length": 1458.8, "completions/mean_length": 1065.69375, "completions/mean_terminated_length": 1065.69375, "completions/min_length": 838.4, "completions/min_terminated_length": 838.4, "entropy": 0.2598737061023712, "epoch": 1.8918918918918919, "frac_reward_zero_std": 0.65, "grad_norm": 0.0, "learning_rate": 3.205718439544463e-07, "loss": 0.0041, "num_tokens": 214855345.0, "reward": 0.8633854150772095, "reward_std": 0.06079368144273758, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8633854150772095, "rewards/e2e_recall_precision_mixed_reward/std": 0.18320930004119873, "sampling/importance_sampling_ratio/max": 1.9000773906707764, "sampling/importance_sampling_ratio/mean": 1.0001107931137085, "sampling/importance_sampling_ratio/min": 0.425476199388504, "sampling/sampling_logp_difference/max": 0.9314059376716614, "sampling/sampling_logp_difference/mean": 0.0131002776324749, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 1672.2, "completions/max_terminated_length": 1612.6, "completions/mean_length": 1148.871875, "completions/mean_terminated_length": 1140.6226806640625, "completions/min_length": 830.2, "completions/min_terminated_length": 830.2, "entropy": 0.2641796410083771, "epoch": 1.8977673325499413, "frac_reward_zero_std": 0.45, "grad_norm": 0.7945494055747986, "learning_rate": 3.199660770535498e-07, "loss": -0.0085, "num_tokens": 215521120.0, "reward": 0.7901041746139527, "reward_std": 0.12795178554952144, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7901041746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.2533832848072052, "sampling/importance_sampling_ratio/max": 1.9268634796142579, "sampling/importance_sampling_ratio/mean": 1.0000725269317627, "sampling/importance_sampling_ratio/min": 0.3661342471837997, "sampling/sampling_logp_difference/max": 1.1095021486282348, "sampling/sampling_logp_difference/mean": 0.013217655010521412, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1666.8, "completions/max_terminated_length": 1666.8, "completions/mean_length": 1044.421875, "completions/mean_terminated_length": 1044.421875, "completions/min_length": 700.6, "completions/min_terminated_length": 700.6, "entropy": 0.26596803069114683, "epoch": 1.9036427732079906, "frac_reward_zero_std": 0.4, "grad_norm": 0.857772707939148, "learning_rate": 3.1936031015265327e-07, "loss": 0.0036, "num_tokens": 216141047.0, "reward": 0.8739062666893005, "reward_std": 0.12695073261857032, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8739062666893005, "rewards/e2e_recall_precision_mixed_reward/std": 0.2268179625272751, "sampling/importance_sampling_ratio/max": 1.8997068881988526, "sampling/importance_sampling_ratio/mean": 0.9999036669731141, "sampling/importance_sampling_ratio/min": 0.3743781954050064, "sampling/sampling_logp_difference/max": 1.1093499660491943, "sampling/sampling_logp_difference/mean": 0.013530664332211017, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1535.8, "completions/max_terminated_length": 1535.8, "completions/mean_length": 1037.45625, "completions/mean_terminated_length": 1037.45625, "completions/min_length": 765.4, "completions/min_terminated_length": 765.4, "entropy": 0.2603359043598175, "epoch": 1.9095182138660398, "frac_reward_zero_std": 0.65, "grad_norm": 0.5619133710861206, "learning_rate": 3.187545432517567e-07, "loss": -0.0101, "num_tokens": 216803305.0, "reward": 0.9479166865348816, "reward_std": 0.06181866824626923, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9479166865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.13064087331295013, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001649498939513, "sampling/importance_sampling_ratio/min": 0.3292679309844971, "sampling/sampling_logp_difference/max": 1.2311968326568603, "sampling/sampling_logp_difference/mean": 0.013423211500048637, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1532.4, "completions/max_terminated_length": 1532.4, "completions/mean_length": 1074.621875, "completions/mean_terminated_length": 1074.621875, "completions/min_length": 813.6, "completions/min_terminated_length": 813.6, "entropy": 0.2620278686285019, "epoch": 1.9153936545240893, "frac_reward_zero_std": 0.4, "grad_norm": 0.7254646420478821, "learning_rate": 3.181487763508602e-07, "loss": -0.0052, "num_tokens": 217482080.0, "reward": 0.7119271039962769, "reward_std": 0.11240836530923844, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7119271039962769, "rewards/e2e_recall_precision_mixed_reward/std": 0.3277711272239685, "sampling/importance_sampling_ratio/max": 1.9904924154281616, "sampling/importance_sampling_ratio/mean": 1.0000413656234741, "sampling/importance_sampling_ratio/min": 0.28767693042755127, "sampling/sampling_logp_difference/max": 1.4912013769149781, "sampling/sampling_logp_difference/mean": 0.013511856086552143, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1490.4, "completions/max_terminated_length": 1490.4, "completions/mean_length": 1064.96875, "completions/mean_terminated_length": 1064.96875, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "entropy": 0.26446402370929717, "epoch": 1.9212690951821387, "frac_reward_zero_std": 0.35, "grad_norm": 0.6776900291442871, "learning_rate": 3.1754300944996363e-07, "loss": 0.0053, "num_tokens": 218150838.0, "reward": 0.7807291984558106, "reward_std": 0.13026501089334488, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7807291984558106, "rewards/e2e_recall_precision_mixed_reward/std": 0.2612376809120178, "sampling/importance_sampling_ratio/max": 1.939541721343994, "sampling/importance_sampling_ratio/mean": 0.9999086499214173, "sampling/importance_sampling_ratio/min": 0.40064749121665955, "sampling/sampling_logp_difference/max": 0.9914682865142822, "sampling/sampling_logp_difference/mean": 0.01353347897529602, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1573.8, "completions/max_terminated_length": 1573.8, "completions/mean_length": 1073.8125, "completions/mean_terminated_length": 1073.8125, "completions/min_length": 699.4, "completions/min_terminated_length": 699.4, "entropy": 0.25780443847179413, "epoch": 1.927144535840188, "frac_reward_zero_std": 0.5, "grad_norm": 0.7870806455612183, "learning_rate": 3.169372425490671e-07, "loss": 0.0056, "num_tokens": 218791034.0, "reward": 0.8082812786102295, "reward_std": 0.07447053156793118, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8082812786102295, "rewards/e2e_recall_precision_mixed_reward/std": 0.3176675528287888, "sampling/importance_sampling_ratio/max": 1.9315118789672852, "sampling/importance_sampling_ratio/mean": 0.999969232082367, "sampling/importance_sampling_ratio/min": 0.32335387766361234, "sampling/sampling_logp_difference/max": 1.1687919616699218, "sampling/sampling_logp_difference/mean": 0.013280937634408473, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1408.8, "completions/max_terminated_length": 1408.8, "completions/mean_length": 1076.375, "completions/mean_terminated_length": 1076.375, "completions/min_length": 748.8, "completions/min_terminated_length": 748.8, "entropy": 0.27145981788635254, "epoch": 1.9330199764982372, "frac_reward_zero_std": 0.45, "grad_norm": 0.6460449695587158, "learning_rate": 3.163314756481706e-07, "loss": -0.0059, "num_tokens": 219458946.0, "reward": 0.8421354293823242, "reward_std": 0.09370578080415726, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8421354293823242, "rewards/e2e_recall_precision_mixed_reward/std": 0.24492353498935698, "sampling/importance_sampling_ratio/max": 1.9178395748138428, "sampling/importance_sampling_ratio/mean": 0.9999516487121582, "sampling/importance_sampling_ratio/min": 0.33379133939743044, "sampling/sampling_logp_difference/max": 1.2747412323951721, "sampling/sampling_logp_difference/mean": 0.01361453216522932, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1519.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 1094.121875, "completions/mean_terminated_length": 1094.121875, "completions/min_length": 839.6, "completions/min_terminated_length": 839.6, "entropy": 0.2646895945072174, "epoch": 1.9388954171562869, "frac_reward_zero_std": 0.25, "grad_norm": 0.7939705848693848, "learning_rate": 3.1572570874727404e-07, "loss": -0.0069, "num_tokens": 220152905.0, "reward": 0.7620833396911622, "reward_std": 0.13086363822221755, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7620833396911622, "rewards/e2e_recall_precision_mixed_reward/std": 0.28319459557533266, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000337362289429, "sampling/importance_sampling_ratio/min": 0.29829355180263517, "sampling/sampling_logp_difference/max": 1.419735050201416, "sampling/sampling_logp_difference/mean": 0.013573858141899108, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1786.4, "completions/max_terminated_length": 1773.8, "completions/mean_length": 1110.184375, "completions/mean_terminated_length": 1106.42705078125, "completions/min_length": 776.8, "completions/min_terminated_length": 776.8, "entropy": 0.2640155255794525, "epoch": 1.9447708578143361, "frac_reward_zero_std": 0.45, "grad_norm": 0.47779330611228943, "learning_rate": 3.1511994184637753e-07, "loss": -0.0012, "num_tokens": 220831344.0, "reward": 0.7302083492279052, "reward_std": 0.11683603897690772, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7302083611488343, "rewards/e2e_recall_precision_mixed_reward/std": 0.38854368925094607, "sampling/importance_sampling_ratio/max": 1.9117161750793457, "sampling/importance_sampling_ratio/mean": 0.9999454617500305, "sampling/importance_sampling_ratio/min": 0.288709232211113, "sampling/sampling_logp_difference/max": 1.4568031072616576, "sampling/sampling_logp_difference/mean": 0.013713906332850457, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1509.4, "completions/max_terminated_length": 1509.4, "completions/mean_length": 1074.428125, "completions/mean_terminated_length": 1074.428125, "completions/min_length": 821.4, "completions/min_terminated_length": 821.4, "entropy": 0.263632670044899, "epoch": 1.9506462984723854, "frac_reward_zero_std": 0.45, "grad_norm": 0.5578112602233887, "learning_rate": 3.1451417494548097e-07, "loss": 0.0022, "num_tokens": 221479513.0, "reward": 0.8171875238418579, "reward_std": 0.09892857819795609, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8171875238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.23545759916305542, "sampling/importance_sampling_ratio/max": 1.907174038887024, "sampling/importance_sampling_ratio/mean": 1.000023603439331, "sampling/importance_sampling_ratio/min": 0.3447980388998985, "sampling/sampling_logp_difference/max": 1.2826555967330933, "sampling/sampling_logp_difference/mean": 0.013302036374807358, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1636.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 1165.56875, "completions/mean_terminated_length": 1165.56875, "completions/min_length": 824.0, "completions/min_terminated_length": 824.0, "entropy": 0.27500487565994264, "epoch": 1.9565217391304348, "frac_reward_zero_std": 0.45, "grad_norm": 0.7453766465187073, "learning_rate": 3.139084080445844e-07, "loss": -0.0032, "num_tokens": 222190751.0, "reward": 0.6704166889190674, "reward_std": 0.12143459171056747, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6704166889190674, "rewards/e2e_recall_precision_mixed_reward/std": 0.3668683707714081, "sampling/importance_sampling_ratio/max": 1.9511921644210815, "sampling/importance_sampling_ratio/mean": 1.0000686645507812, "sampling/importance_sampling_ratio/min": 0.26080631613731386, "sampling/sampling_logp_difference/max": 1.390922975540161, "sampling/sampling_logp_difference/mean": 0.01381862722337246, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1588.8, "completions/max_terminated_length": 1588.8, "completions/mean_length": 1106.275, "completions/mean_terminated_length": 1106.275, "completions/min_length": 848.2, "completions/min_terminated_length": 848.2, "entropy": 0.2858578205108643, "epoch": 1.9623971797884843, "frac_reward_zero_std": 0.4, "grad_norm": 0.7552258372306824, "learning_rate": 3.133026411436879e-07, "loss": 0.0018, "num_tokens": 222842407.0, "reward": 0.8156250238418579, "reward_std": 0.12306551039218902, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8156250238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.2660649374127388, "sampling/importance_sampling_ratio/max": 1.9279718160629273, "sampling/importance_sampling_ratio/mean": 1.0000049114227294, "sampling/importance_sampling_ratio/min": 0.18268027827143668, "sampling/sampling_logp_difference/max": 1.923844289779663, "sampling/sampling_logp_difference/mean": 0.0143051628023386, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1776.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 1201.125, "completions/mean_terminated_length": 1201.125, "completions/min_length": 876.2, "completions/min_terminated_length": 876.2, "entropy": 0.2890251398086548, "epoch": 1.9682726204465335, "frac_reward_zero_std": 0.45, "grad_norm": 0.6220123171806335, "learning_rate": 3.1269687424279133e-07, "loss": -0.0051, "num_tokens": 223545759.0, "reward": 0.7908854246139526, "reward_std": 0.10933632254600525, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7908854365348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.29638255536556246, "sampling/importance_sampling_ratio/max": 1.9763683319091796, "sampling/importance_sampling_ratio/mean": 1.000083565711975, "sampling/importance_sampling_ratio/min": 0.35314607322216035, "sampling/sampling_logp_difference/max": 1.0618009567260742, "sampling/sampling_logp_difference/mean": 0.014300593733787536, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 1733.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 1136.7125, "completions/mean_terminated_length": 1127.995556640625, "completions/min_length": 782.4, "completions/min_terminated_length": 782.4, "entropy": 0.2694635778665543, "epoch": 1.9741480611045827, "frac_reward_zero_std": 0.55, "grad_norm": 0.4368976056575775, "learning_rate": 3.120911073418948e-07, "loss": -0.0173, "num_tokens": 224206443.0, "reward": 0.8182291746139526, "reward_std": 0.07583726048469544, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8182291746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.25061969757080077, "sampling/importance_sampling_ratio/max": 1.8513876676559449, "sampling/importance_sampling_ratio/mean": 0.9999567866325378, "sampling/importance_sampling_ratio/min": 0.2878729492425919, "sampling/sampling_logp_difference/max": 1.3490334510803224, "sampling/sampling_logp_difference/mean": 0.013676924258470535, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1541.2, "completions/max_terminated_length": 1541.2, "completions/mean_length": 1131.984375, "completions/mean_terminated_length": 1131.984375, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "entropy": 0.2611004739999771, "epoch": 1.9800235017626322, "frac_reward_zero_std": 0.55, "grad_norm": 0.5511998534202576, "learning_rate": 3.1148534044099826e-07, "loss": -0.0031, "num_tokens": 224880918.0, "reward": 0.798645842075348, "reward_std": 0.08196402341127396, "rewards/e2e_recall_precision_mixed_reward/mean": 0.798645842075348, "rewards/e2e_recall_precision_mixed_reward/std": 0.25188693404197693, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000040054321289, "sampling/importance_sampling_ratio/min": 0.3580232530832291, "sampling/sampling_logp_difference/max": 1.0668410778045654, "sampling/sampling_logp_difference/mean": 0.013273421488702298, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1610.0, "completions/max_terminated_length": 1610.0, "completions/mean_length": 1153.4875, "completions/mean_terminated_length": 1153.4875, "completions/min_length": 816.2, "completions/min_terminated_length": 816.2, "entropy": 0.29184606671333313, "epoch": 1.9858989424206817, "frac_reward_zero_std": 0.55, "grad_norm": 0.4519566595554352, "learning_rate": 3.1087957354010175e-07, "loss": 0.004, "num_tokens": 225568706.0, "reward": 0.817187511920929, "reward_std": 0.0991033136844635, "rewards/e2e_recall_precision_mixed_reward/mean": 0.817187511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.28155410587787627, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999716281890869, "sampling/importance_sampling_ratio/min": 0.31005223616957667, "sampling/sampling_logp_difference/max": 1.4509097576141357, "sampling/sampling_logp_difference/mean": 0.014347866736352443, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1595.0, "completions/max_terminated_length": 1595.0, "completions/mean_length": 1157.546875, "completions/mean_terminated_length": 1157.546875, "completions/min_length": 833.8, "completions/min_terminated_length": 833.8, "entropy": 0.272691935300827, "epoch": 1.991774383078731, "frac_reward_zero_std": 0.55, "grad_norm": 0.48185354471206665, "learning_rate": 3.1027380663920524e-07, "loss": 0.0018, "num_tokens": 226276833.0, "reward": 0.8002604246139526, "reward_std": 0.07418519258499146, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8002604365348815, "rewards/e2e_recall_precision_mixed_reward/std": 0.31211295127868655, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999703764915466, "sampling/importance_sampling_ratio/min": 0.24243721812963487, "sampling/sampling_logp_difference/max": 1.4976640701293946, "sampling/sampling_logp_difference/mean": 0.013997980579733848, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1685.6, "completions/max_terminated_length": 1685.6, "completions/mean_length": 1165.35625, "completions/mean_terminated_length": 1165.35625, "completions/min_length": 845.4, "completions/min_terminated_length": 845.4, "entropy": 0.2845953702926636, "epoch": 1.9976498237367801, "frac_reward_zero_std": 0.4, "grad_norm": 0.9591985940933228, "learning_rate": 3.0966803973830867e-07, "loss": 0.0079, "num_tokens": 227020035.0, "reward": 0.6323958516120911, "reward_std": 0.1353075310587883, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6323958516120911, "rewards/e2e_recall_precision_mixed_reward/std": 0.3693024396896362, "sampling/importance_sampling_ratio/max": 1.9484627723693848, "sampling/importance_sampling_ratio/mean": 0.9999523758888245, "sampling/importance_sampling_ratio/min": 0.44701356887817384, "sampling/sampling_logp_difference/max": 1.0415231466293335, "sampling/sampling_logp_difference/mean": 0.014189911261200904, "step": 1700 }, { "epoch": 1.9976498237367801, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1661.84, "eval_completions/max_terminated_length": 1661.84, "eval_completions/mean_length": 1161.033125, "eval_completions/mean_terminated_length": 1161.033125, "eval_completions/min_length": 846.36, "eval_completions/min_terminated_length": 846.36, "eval_entropy": 0.2866858780384064, "eval_frac_reward_zero_std": 0.58, "eval_loss": 0.004300988744944334, "eval_num_tokens": 227020035.0, "eval_reward": 0.7361770963668823, "eval_reward_std": 0.08369863875210286, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7361770963668823, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.31068264067173, "eval_runtime": 451.7625, "eval_samples_per_second": 0.221, "eval_sampling/importance_sampling_ratio/max": 1.9099996519088744, "eval_sampling/importance_sampling_ratio/mean": 1.000064604282379, "eval_sampling/importance_sampling_ratio/min": 0.36046827347949145, "eval_sampling/sampling_logp_difference/max": 1.241805739402771, "eval_sampling/sampling_logp_difference/mean": 0.014108292125165463, "eval_steps_per_second": 0.004, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1656.6, "completions/max_terminated_length": 1656.6, "completions/mean_length": 1174.23125, "completions/mean_terminated_length": 1174.23125, "completions/min_length": 818.8, "completions/min_terminated_length": 818.8, "entropy": 0.2785020112991333, "epoch": 2.00352526439483, "frac_reward_zero_std": 0.45, "grad_norm": 0.871213436126709, "learning_rate": 3.0906227283741216e-07, "loss": 0.0072, "num_tokens": 227740653.0, "reward": 0.7760416746139527, "reward_std": 0.10184991657733918, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7760416746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.31230961382389066, "sampling/importance_sampling_ratio/max": 1.9423685312271117, "sampling/importance_sampling_ratio/mean": 1.0000242710113525, "sampling/importance_sampling_ratio/min": 0.38754723966121674, "sampling/sampling_logp_difference/max": 1.279812264442444, "sampling/sampling_logp_difference/mean": 0.013877030275762082, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1814.2, "completions/max_terminated_length": 1814.2, "completions/mean_length": 1234.190625, "completions/mean_terminated_length": 1234.190625, "completions/min_length": 928.4, "completions/min_terminated_length": 928.4, "entropy": 0.29070239663124087, "epoch": 2.009400705052879, "frac_reward_zero_std": 0.6, "grad_norm": 0.5816195011138916, "learning_rate": 3.084565059365156e-07, "loss": 0.0039, "num_tokens": 228487386.0, "reward": 0.8880208492279053, "reward_std": 0.08002980649471284, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8880208492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.20378222912549973, "sampling/importance_sampling_ratio/max": 1.9560601711273193, "sampling/importance_sampling_ratio/mean": 1.0000455379486084, "sampling/importance_sampling_ratio/min": 0.26356355398893355, "sampling/sampling_logp_difference/max": 1.8925941467285157, "sampling/sampling_logp_difference/mean": 0.01430057343095541, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 2000.6, "completions/max_terminated_length": 1791.6, "completions/mean_length": 1251.0375, "completions/mean_terminated_length": 1243.4466552734375, "completions/min_length": 902.8, "completions/min_terminated_length": 902.8, "entropy": 0.28459187150001525, "epoch": 2.0152761457109283, "frac_reward_zero_std": 0.5, "grad_norm": 0.5703986883163452, "learning_rate": 3.078507390356191e-07, "loss": -0.0065, "num_tokens": 229218670.0, "reward": 0.7763020873069764, "reward_std": 0.10708752572536469, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7763020873069764, "rewards/e2e_recall_precision_mixed_reward/std": 0.3236579895019531, "sampling/importance_sampling_ratio/max": 1.9888316869735718, "sampling/importance_sampling_ratio/mean": 1.000116801261902, "sampling/importance_sampling_ratio/min": 0.32361292839050293, "sampling/sampling_logp_difference/max": 1.2249764919281005, "sampling/sampling_logp_difference/mean": 0.014165903627872466, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.8, "completions/max_terminated_length": 1804.8, "completions/mean_length": 1247.0375, "completions/mean_terminated_length": 1247.0375, "completions/min_length": 867.8, "completions/min_terminated_length": 867.8, "entropy": 0.29395602345466615, "epoch": 2.0211515863689775, "frac_reward_zero_std": 0.4, "grad_norm": 0.7805261611938477, "learning_rate": 3.072449721347226e-07, "loss": -0.0015, "num_tokens": 229942954.0, "reward": 0.9083333492279053, "reward_std": 0.11098030507564545, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9083333492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.17272518202662468, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999336123466491, "sampling/importance_sampling_ratio/min": 0.35569711625576017, "sampling/sampling_logp_difference/max": 1.0804112911224366, "sampling/sampling_logp_difference/mean": 0.01464350800961256, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1867.0, "completions/max_terminated_length": 1867.0, "completions/mean_length": 1240.828125, "completions/mean_terminated_length": 1240.828125, "completions/min_length": 846.4, "completions/min_terminated_length": 846.4, "entropy": 0.28558818697929383, "epoch": 2.027027027027027, "frac_reward_zero_std": 0.5, "grad_norm": 0.4648728668689728, "learning_rate": 3.06639205233826e-07, "loss": 0.003, "num_tokens": 230669475.0, "reward": 0.848437511920929, "reward_std": 0.1070944607257843, "rewards/e2e_recall_precision_mixed_reward/mean": 0.848437511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.25224395394325255, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000049948692322, "sampling/importance_sampling_ratio/min": 0.31880449652671816, "sampling/sampling_logp_difference/max": 1.497407031059265, "sampling/sampling_logp_difference/mean": 0.014051317609846592, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1756.0, "completions/max_terminated_length": 1756.0, "completions/mean_length": 1243.184375, "completions/mean_terminated_length": 1243.184375, "completions/min_length": 934.2, "completions/min_terminated_length": 934.2, "entropy": 0.2831280082464218, "epoch": 2.0329024676850764, "frac_reward_zero_std": 0.45, "grad_norm": 0.7145424485206604, "learning_rate": 3.060334383329295e-07, "loss": -0.0021, "num_tokens": 231379070.0, "reward": 0.7850000143051148, "reward_std": 0.10773698166012764, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7850000143051148, "rewards/e2e_recall_precision_mixed_reward/std": 0.3063537538051605, "sampling/importance_sampling_ratio/max": 1.9744776964187623, "sampling/importance_sampling_ratio/mean": 0.9998899936676026, "sampling/importance_sampling_ratio/min": 0.27880694568157194, "sampling/sampling_logp_difference/max": 1.4340813398361205, "sampling/sampling_logp_difference/mean": 0.013902221620082856, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1843.4, "completions/max_terminated_length": 1843.4, "completions/mean_length": 1300.975, "completions/mean_terminated_length": 1300.975, "completions/min_length": 938.4, "completions/min_terminated_length": 938.4, "entropy": 0.2918659746646881, "epoch": 2.0387779083431257, "frac_reward_zero_std": 0.4, "grad_norm": 0.6800016760826111, "learning_rate": 3.0542767143203294e-07, "loss": 0.0042, "num_tokens": 232105430.0, "reward": 0.6939583420753479, "reward_std": 0.12365827858448028, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6939583420753479, "rewards/e2e_recall_precision_mixed_reward/std": 0.30231705904006956, "sampling/importance_sampling_ratio/max": 1.9620429277420044, "sampling/importance_sampling_ratio/mean": 0.9999876499176026, "sampling/importance_sampling_ratio/min": 0.3293029397726059, "sampling/sampling_logp_difference/max": 1.3565265893936158, "sampling/sampling_logp_difference/mean": 0.01423037126660347, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1720.0, "completions/max_terminated_length": 1720.0, "completions/mean_length": 1240.153125, "completions/mean_terminated_length": 1240.153125, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "entropy": 0.298739355802536, "epoch": 2.044653349001175, "frac_reward_zero_std": 0.25, "grad_norm": 0.5738968253135681, "learning_rate": 3.0482190453113643e-07, "loss": 0.0037, "num_tokens": 232801799.0, "reward": 0.8646875262260437, "reward_std": 0.14013182669878005, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8646875262260437, "rewards/e2e_recall_precision_mixed_reward/std": 0.22773447036743164, "sampling/importance_sampling_ratio/max": 1.963773488998413, "sampling/importance_sampling_ratio/mean": 1.0000531673431396, "sampling/importance_sampling_ratio/min": 0.3736042261123657, "sampling/sampling_logp_difference/max": 1.0158153533935548, "sampling/sampling_logp_difference/mean": 0.01465737223625183, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1745.2, "completions/max_terminated_length": 1745.2, "completions/mean_length": 1227.05625, "completions/mean_terminated_length": 1227.05625, "completions/min_length": 891.2, "completions/min_terminated_length": 891.2, "entropy": 0.2842448323965073, "epoch": 2.0505287896592246, "frac_reward_zero_std": 0.5, "grad_norm": 0.5650268793106079, "learning_rate": 3.0421613763023986e-07, "loss": 0.0065, "num_tokens": 233537097.0, "reward": 0.8348958373069764, "reward_std": 0.08204673230648041, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8348958492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.2639793872833252, "sampling/importance_sampling_ratio/max": 1.9064828634262085, "sampling/importance_sampling_ratio/mean": 1.0000011801719666, "sampling/importance_sampling_ratio/min": 0.3541615068912506, "sampling/sampling_logp_difference/max": 1.1652244329452515, "sampling/sampling_logp_difference/mean": 0.014108946174383163, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1584.2, "completions/max_terminated_length": 1584.2, "completions/mean_length": 1150.646875, "completions/mean_terminated_length": 1150.646875, "completions/min_length": 816.2, "completions/min_terminated_length": 816.2, "entropy": 0.28300136923789976, "epoch": 2.056404230317274, "frac_reward_zero_std": 0.45, "grad_norm": 0.721430242061615, "learning_rate": 3.036103707293433e-07, "loss": 0.0096, "num_tokens": 234240840.0, "reward": 0.8783854365348815, "reward_std": 0.09260771721601486, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8783854365348815, "rewards/e2e_recall_precision_mixed_reward/std": 0.23209483027458191, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999852180480957, "sampling/importance_sampling_ratio/min": 0.27702509611845016, "sampling/sampling_logp_difference/max": 1.4427910327911377, "sampling/sampling_logp_difference/mean": 0.014436537213623524, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1795.6, "completions/max_terminated_length": 1795.6, "completions/mean_length": 1341.33125, "completions/mean_terminated_length": 1341.33125, "completions/min_length": 961.6, "completions/min_terminated_length": 961.6, "entropy": 0.2973055899143219, "epoch": 2.062279670975323, "frac_reward_zero_std": 0.4, "grad_norm": 0.7037875056266785, "learning_rate": 3.030046038284468e-07, "loss": 0.0024, "num_tokens": 235011698.0, "reward": 0.8645833492279053, "reward_std": 0.10491203367710114, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8645833492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.2077777147293091, "sampling/importance_sampling_ratio/max": 1.8878745079040526, "sampling/importance_sampling_ratio/mean": 0.9999104261398315, "sampling/importance_sampling_ratio/min": 0.27942183911800383, "sampling/sampling_logp_difference/max": 1.43536217212677, "sampling/sampling_logp_difference/mean": 0.014530559442937374, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1723.2, "completions/max_terminated_length": 1723.2, "completions/mean_length": 1207.046875, "completions/mean_terminated_length": 1207.046875, "completions/min_length": 901.0, "completions/min_terminated_length": 901.0, "entropy": 0.28618607819080355, "epoch": 2.0681551116333723, "frac_reward_zero_std": 0.65, "grad_norm": 0.4199642837047577, "learning_rate": 3.023988369275502e-07, "loss": -0.0002, "num_tokens": 235711745.0, "reward": 0.7817708373069763, "reward_std": 0.06876285523176193, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7817708373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.24962895214557648, "sampling/importance_sampling_ratio/max": 1.9418248176574706, "sampling/importance_sampling_ratio/mean": 0.9999440431594848, "sampling/importance_sampling_ratio/min": 0.2679483711719513, "sampling/sampling_logp_difference/max": 1.456888699531555, "sampling/sampling_logp_difference/mean": 0.014030390232801438, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1463.4, "completions/max_terminated_length": 1463.4, "completions/mean_length": 1105.29375, "completions/mean_terminated_length": 1105.29375, "completions/min_length": 835.4, "completions/min_terminated_length": 835.4, "entropy": 0.2893945574760437, "epoch": 2.074030552291422, "frac_reward_zero_std": 0.45, "grad_norm": 0.7728146314620972, "learning_rate": 3.017930700266537e-07, "loss": 0.0075, "num_tokens": 236373455.0, "reward": 0.7203125178813934, "reward_std": 0.09382694512605667, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7203125178813934, "rewards/e2e_recall_precision_mixed_reward/std": 0.25324483811855314, "sampling/importance_sampling_ratio/max": 1.998999786376953, "sampling/importance_sampling_ratio/mean": 0.9999812364578247, "sampling/importance_sampling_ratio/min": 0.25601265765726566, "sampling/sampling_logp_difference/max": 2.0012857913970947, "sampling/sampling_logp_difference/mean": 0.014354320801794529, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1613.4, "completions/max_terminated_length": 1613.4, "completions/mean_length": 1165.15625, "completions/mean_terminated_length": 1165.15625, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "entropy": 0.271235328912735, "epoch": 2.079905992949471, "frac_reward_zero_std": 0.55, "grad_norm": 0.6383501887321472, "learning_rate": 3.011873031257572e-07, "loss": 0.0011, "num_tokens": 237069169.0, "reward": 0.7666146039962769, "reward_std": 0.09578602537512779, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7666146039962769, "rewards/e2e_recall_precision_mixed_reward/std": 0.3138491868972778, "sampling/importance_sampling_ratio/max": 1.8899298191070557, "sampling/importance_sampling_ratio/mean": 0.9999959945678711, "sampling/importance_sampling_ratio/min": 0.24239584915339946, "sampling/sampling_logp_difference/max": 1.8334716081619262, "sampling/sampling_logp_difference/mean": 0.013750493712723254, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1592.8, "completions/max_terminated_length": 1592.8, "completions/mean_length": 1175.80625, "completions/mean_terminated_length": 1175.80625, "completions/min_length": 837.0, "completions/min_terminated_length": 837.0, "entropy": 0.277604877948761, "epoch": 2.0857814336075204, "frac_reward_zero_std": 0.4, "grad_norm": 0.7303882241249084, "learning_rate": 3.0058153622486064e-07, "loss": 0.0019, "num_tokens": 237758851.0, "reward": 0.7031250119209289, "reward_std": 0.12986374348402024, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7031250238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.3014916732907295, "sampling/importance_sampling_ratio/max": 1.9738976955413818, "sampling/importance_sampling_ratio/mean": 1.0001816511154176, "sampling/importance_sampling_ratio/min": 0.37753416895866393, "sampling/sampling_logp_difference/max": 1.017386221885681, "sampling/sampling_logp_difference/mean": 0.01398746259510517, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1622.6, "completions/max_terminated_length": 1622.6, "completions/mean_length": 1112.41875, "completions/mean_terminated_length": 1112.41875, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "entropy": 0.25624861419200895, "epoch": 2.09165687426557, "frac_reward_zero_std": 0.55, "grad_norm": 0.76189124584198, "learning_rate": 2.9997576932396413e-07, "loss": 0.0029, "num_tokens": 238420953.0, "reward": 0.8820833444595337, "reward_std": 0.11259067952632903, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8820833563804626, "rewards/e2e_recall_precision_mixed_reward/std": 0.23160261511802674, "sampling/importance_sampling_ratio/max": 1.8961010932922364, "sampling/importance_sampling_ratio/mean": 1.0000606298446655, "sampling/importance_sampling_ratio/min": 0.33668322265148165, "sampling/sampling_logp_difference/max": 1.1480276823043822, "sampling/sampling_logp_difference/mean": 0.013318390399217606, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1628.4, "completions/max_terminated_length": 1628.4, "completions/mean_length": 1162.1875, "completions/mean_terminated_length": 1162.1875, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "entropy": 0.2680640757083893, "epoch": 2.0975323149236194, "frac_reward_zero_std": 0.55, "grad_norm": 0.5976382493972778, "learning_rate": 2.9937000242306757e-07, "loss": -0.0004, "num_tokens": 239096837.0, "reward": 0.775781261920929, "reward_std": 0.1137162283062935, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7757812738418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.32021387219429015, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001160860061646, "sampling/importance_sampling_ratio/min": 0.3333402812480927, "sampling/sampling_logp_difference/max": 1.1439990282058716, "sampling/sampling_logp_difference/mean": 0.01385085079818964, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1503.2, "completions/max_terminated_length": 1503.2, "completions/mean_length": 1124.1125, "completions/mean_terminated_length": 1124.1125, "completions/min_length": 875.2, "completions/min_terminated_length": 875.2, "entropy": 0.2674242615699768, "epoch": 2.1034077555816686, "frac_reward_zero_std": 0.65, "grad_norm": 0.0, "learning_rate": 2.9876423552217106e-07, "loss": 0.0022, "num_tokens": 239770009.0, "reward": 0.7948437690734863, "reward_std": 0.059596112370491026, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7948437690734863, "rewards/e2e_recall_precision_mixed_reward/std": 0.26889737248420714, "sampling/importance_sampling_ratio/max": 1.8605868816375732, "sampling/importance_sampling_ratio/mean": 1.0000072717666626, "sampling/importance_sampling_ratio/min": 0.3218778297305107, "sampling/sampling_logp_difference/max": 1.3112666606903076, "sampling/sampling_logp_difference/mean": 0.013513725996017457, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1477.6, "completions/max_terminated_length": 1477.6, "completions/mean_length": 1035.5875, "completions/mean_terminated_length": 1035.5875, "completions/min_length": 697.8, "completions/min_terminated_length": 697.8, "entropy": 0.24554576575756074, "epoch": 2.109283196239718, "frac_reward_zero_std": 0.6, "grad_norm": 0.6205167174339294, "learning_rate": 2.9815846862127455e-07, "loss": 0.0004, "num_tokens": 240438149.0, "reward": 0.7682291746139527, "reward_std": 0.0869694009423256, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7682291746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.3167840033769608, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999816417694092, "sampling/importance_sampling_ratio/min": 0.39744282960891725, "sampling/sampling_logp_difference/max": 1.184657597541809, "sampling/sampling_logp_difference/mean": 0.013029644265770912, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1595.2, "completions/max_terminated_length": 1595.2, "completions/mean_length": 1139.05, "completions/mean_terminated_length": 1139.05, "completions/min_length": 796.6, "completions/min_terminated_length": 796.6, "entropy": 0.27050881683826444, "epoch": 2.1151586368977675, "frac_reward_zero_std": 0.35, "grad_norm": 0.6889888048171997, "learning_rate": 2.97552701720378e-07, "loss": 0.0004, "num_tokens": 241151557.0, "reward": 0.7417187571525574, "reward_std": 0.157486841827631, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7417187690734863, "rewards/e2e_recall_precision_mixed_reward/std": 0.32728450894355776, "sampling/importance_sampling_ratio/max": 1.932526707649231, "sampling/importance_sampling_ratio/mean": 1.0000963449478149, "sampling/importance_sampling_ratio/min": 0.346428656578064, "sampling/sampling_logp_difference/max": 1.278468632698059, "sampling/sampling_logp_difference/mean": 0.01399834081530571, "step": 1800 }, { "epoch": 2.1151586368977675, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1458.0, "eval_completions/max_terminated_length": 1458.0, "eval_completions/mean_length": 1057.511875, "eval_completions/mean_terminated_length": 1057.511875, "eval_completions/min_length": 779.68, "eval_completions/min_terminated_length": 779.68, "eval_entropy": 0.25933306515216825, "eval_frac_reward_zero_std": 0.56, "eval_loss": 0.005401823669672012, "eval_num_tokens": 241151557.0, "eval_reward": 0.729833345413208, "eval_reward_std": 0.08868716955184937, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.729833345413208, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3078808444738388, "eval_runtime": 406.5719, "eval_samples_per_second": 0.246, "eval_sampling/importance_sampling_ratio/max": 1.945242338180542, "eval_sampling/importance_sampling_ratio/mean": 1.0000135159492494, "eval_sampling/importance_sampling_ratio/min": 0.3708919485433554, "eval_sampling/sampling_logp_difference/max": 1.7694089603424072, "eval_sampling/sampling_logp_difference/mean": 0.013369039855897427, "eval_steps_per_second": 0.005, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1519.4, "completions/max_terminated_length": 1519.4, "completions/mean_length": 1092.36875, "completions/mean_terminated_length": 1092.36875, "completions/min_length": 736.4, "completions/min_terminated_length": 736.4, "entropy": 0.24719617068767546, "epoch": 2.1210340775558167, "frac_reward_zero_std": 0.45, "grad_norm": 0.0, "learning_rate": 2.9694693481948147e-07, "loss": -0.0047, "num_tokens": 241802075.0, "reward": 0.7389062762260437, "reward_std": 0.10075291693210602, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7389062762260437, "rewards/e2e_recall_precision_mixed_reward/std": 0.30727340281009674, "sampling/importance_sampling_ratio/max": 1.9569374561309814, "sampling/importance_sampling_ratio/mean": 0.9999837636947632, "sampling/importance_sampling_ratio/min": 0.3740126609802246, "sampling/sampling_logp_difference/max": 1.0094846487045288, "sampling/sampling_logp_difference/mean": 0.01285008229315281, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1400.8, "completions/max_terminated_length": 1400.8, "completions/mean_length": 1044.871875, "completions/mean_terminated_length": 1044.871875, "completions/min_length": 732.0, "completions/min_terminated_length": 732.0, "entropy": 0.24336313009262084, "epoch": 2.126909518213866, "frac_reward_zero_std": 0.55, "grad_norm": 0.4582900106906891, "learning_rate": 2.963411679185849e-07, "loss": 0.0039, "num_tokens": 242455362.0, "reward": 0.8213541746139527, "reward_std": 0.08186697289347648, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8213541686534882, "rewards/e2e_recall_precision_mixed_reward/std": 0.21659500002861024, "sampling/importance_sampling_ratio/max": 1.9403943777084351, "sampling/importance_sampling_ratio/mean": 0.9999736547470093, "sampling/importance_sampling_ratio/min": 0.2570742294192314, "sampling/sampling_logp_difference/max": 1.5891379356384276, "sampling/sampling_logp_difference/mean": 0.013036347925662994, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1502.8, "completions/max_terminated_length": 1502.8, "completions/mean_length": 1106.55625, "completions/mean_terminated_length": 1106.55625, "completions/min_length": 841.4, "completions/min_terminated_length": 841.4, "entropy": 0.25111518502235414, "epoch": 2.132784958871915, "frac_reward_zero_std": 0.45, "grad_norm": 0.6569045186042786, "learning_rate": 2.957354010176884e-07, "loss": -0.0027, "num_tokens": 243131412.0, "reward": 0.8187500238418579, "reward_std": 0.09741906523704529, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8187500238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.2573035418987274, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001561999320985, "sampling/importance_sampling_ratio/min": 0.29898915837402457, "sampling/sampling_logp_difference/max": 2.6158031940460207, "sampling/sampling_logp_difference/mean": 0.013235697895288468, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1578.2, "completions/max_terminated_length": 1578.2, "completions/mean_length": 1109.54375, "completions/mean_terminated_length": 1109.54375, "completions/min_length": 816.6, "completions/min_terminated_length": 816.6, "entropy": 0.2552741885185242, "epoch": 2.138660399529965, "frac_reward_zero_std": 0.2, "grad_norm": 0.8629297018051147, "learning_rate": 2.951296341167919e-07, "loss": 0.0104, "num_tokens": 243818002.0, "reward": 0.8177083611488343, "reward_std": 0.16347545087337495, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8177083611488343, "rewards/e2e_recall_precision_mixed_reward/std": 0.2514489233493805, "sampling/importance_sampling_ratio/max": 1.9939273834228515, "sampling/importance_sampling_ratio/mean": 0.9999308586120605, "sampling/importance_sampling_ratio/min": 0.27947772494808304, "sampling/sampling_logp_difference/max": 2.8928178787231444, "sampling/sampling_logp_difference/mean": 0.013579142279922962, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1492.6, "completions/max_terminated_length": 1492.6, "completions/mean_length": 1132.365625, "completions/mean_terminated_length": 1132.365625, "completions/min_length": 836.8, "completions/min_terminated_length": 836.8, "entropy": 0.24973794519901277, "epoch": 2.144535840188014, "frac_reward_zero_std": 0.25, "grad_norm": 0.7742500305175781, "learning_rate": 2.9452386721589527e-07, "loss": 0.003, "num_tokens": 244494887.0, "reward": 0.8229166984558105, "reward_std": 0.1541348308324814, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8229166984558105, "rewards/e2e_recall_precision_mixed_reward/std": 0.2586108475923538, "sampling/importance_sampling_ratio/max": 1.8992940664291382, "sampling/importance_sampling_ratio/mean": 1.0000396728515626, "sampling/importance_sampling_ratio/min": 0.41788731813430785, "sampling/sampling_logp_difference/max": 0.883245873451233, "sampling/sampling_logp_difference/mean": 0.013097657822072506, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1717.2, "completions/max_terminated_length": 1717.2, "completions/mean_length": 1177.0, "completions/mean_terminated_length": 1177.0, "completions/min_length": 850.2, "completions/min_terminated_length": 850.2, "entropy": 0.26592395901679994, "epoch": 2.1504112808460634, "frac_reward_zero_std": 0.4, "grad_norm": 0.8587401509284973, "learning_rate": 2.9391810031499876e-07, "loss": -0.0011, "num_tokens": 245227175.0, "reward": 0.7255208551883697, "reward_std": 0.1329521119594574, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7255208551883697, "rewards/e2e_recall_precision_mixed_reward/std": 0.29523513913154603, "sampling/importance_sampling_ratio/max": 1.981991744041443, "sampling/importance_sampling_ratio/mean": 0.9999511480331421, "sampling/importance_sampling_ratio/min": 0.29395384937524793, "sampling/sampling_logp_difference/max": 1.3618286371231079, "sampling/sampling_logp_difference/mean": 0.014108171686530113, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1515.2, "completions/max_terminated_length": 1515.2, "completions/mean_length": 1118.55, "completions/mean_terminated_length": 1118.55, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "entropy": 0.23380440473556519, "epoch": 2.1562867215041126, "frac_reward_zero_std": 0.5, "grad_norm": 0.8163400888442993, "learning_rate": 2.933123334141022e-07, "loss": 0.0026, "num_tokens": 245905831.0, "reward": 0.9013021111488342, "reward_std": 0.10105133950710296, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9013021111488342, "rewards/e2e_recall_precision_mixed_reward/std": 0.16864475458860398, "sampling/importance_sampling_ratio/max": 1.9898765087127686, "sampling/importance_sampling_ratio/mean": 1.000045382976532, "sampling/importance_sampling_ratio/min": 0.25343484356999396, "sampling/sampling_logp_difference/max": 1.7442261219024657, "sampling/sampling_logp_difference/mean": 0.012684360519051552, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 1012.203125, "completions/mean_terminated_length": 1012.203125, "completions/min_length": 780.2, "completions/min_terminated_length": 780.2, "entropy": 0.2402132272720337, "epoch": 2.1621621621621623, "frac_reward_zero_std": 0.5, "grad_norm": 0.8790619969367981, "learning_rate": 2.927065665132057e-07, "loss": 0.0047, "num_tokens": 246525720.0, "reward": 0.8596354246139526, "reward_std": 0.09730460494756699, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8596354365348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.25916803777217867, "sampling/importance_sampling_ratio/max": 1.9251169681549072, "sampling/importance_sampling_ratio/mean": 1.000064241886139, "sampling/importance_sampling_ratio/min": 0.3161891311407089, "sampling/sampling_logp_difference/max": 1.4003211498260497, "sampling/sampling_logp_difference/mean": 0.012864516861736775, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1499.2, "completions/max_terminated_length": 1499.2, "completions/mean_length": 1105.99375, "completions/mean_terminated_length": 1105.99375, "completions/min_length": 857.2, "completions/min_terminated_length": 857.2, "entropy": 0.24603658318519592, "epoch": 2.1680376028202115, "frac_reward_zero_std": 0.55, "grad_norm": 0.5251009464263916, "learning_rate": 2.921007996123092e-07, "loss": 0.0014, "num_tokens": 247206918.0, "reward": 0.6041666746139527, "reward_std": 0.09626547321677208, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6041666746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.38971813917160036, "sampling/importance_sampling_ratio/max": 1.990101408958435, "sampling/importance_sampling_ratio/mean": 0.9999129176139832, "sampling/importance_sampling_ratio/min": 0.37296884059906005, "sampling/sampling_logp_difference/max": 1.1782666206359864, "sampling/sampling_logp_difference/mean": 0.012996607273817063, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1508.2, "completions/max_terminated_length": 1508.2, "completions/mean_length": 1116.371875, "completions/mean_terminated_length": 1116.371875, "completions/min_length": 844.4, "completions/min_terminated_length": 844.4, "entropy": 0.2545777499675751, "epoch": 2.1739130434782608, "frac_reward_zero_std": 0.5, "grad_norm": 0.6853423714637756, "learning_rate": 2.914950327114126e-07, "loss": -0.0086, "num_tokens": 247856413.0, "reward": 0.8486979246139527, "reward_std": 0.11036976650357247, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8486979246139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.250604185461998, "sampling/importance_sampling_ratio/max": 1.9809516191482544, "sampling/importance_sampling_ratio/mean": 0.9998941421508789, "sampling/importance_sampling_ratio/min": 0.20959659069776534, "sampling/sampling_logp_difference/max": 1.7914276361465453, "sampling/sampling_logp_difference/mean": 0.013322827219963074, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1446.4, "completions/max_terminated_length": 1446.4, "completions/mean_length": 1050.86875, "completions/mean_terminated_length": 1050.86875, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "entropy": 0.22533740401268004, "epoch": 2.17978848413631, "frac_reward_zero_std": 0.4, "grad_norm": 0.6656336188316345, "learning_rate": 2.908892658105161e-07, "loss": 0.0003, "num_tokens": 248524867.0, "reward": 0.8348958611488342, "reward_std": 0.0947174459695816, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8348958611488342, "rewards/e2e_recall_precision_mixed_reward/std": 0.2426920771598816, "sampling/importance_sampling_ratio/max": 1.9969067335128785, "sampling/importance_sampling_ratio/mean": 1.0000614404678345, "sampling/importance_sampling_ratio/min": 0.3773179233074188, "sampling/sampling_logp_difference/max": 1.1439204216003418, "sampling/sampling_logp_difference/mean": 0.012491510808467865, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1496.4, "completions/max_terminated_length": 1496.4, "completions/mean_length": 1021.7625, "completions/mean_terminated_length": 1021.7625, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "entropy": 0.2335586816072464, "epoch": 2.1856639247943597, "frac_reward_zero_std": 0.6, "grad_norm": 0.501934289932251, "learning_rate": 2.9028349890961954e-07, "loss": -0.0001, "num_tokens": 249182711.0, "reward": 0.728906261920929, "reward_std": 0.07847404927015304, "rewards/e2e_recall_precision_mixed_reward/mean": 0.728906261920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.36090049147605896, "sampling/importance_sampling_ratio/max": 1.8162980318069457, "sampling/importance_sampling_ratio/mean": 0.9999779343605042, "sampling/importance_sampling_ratio/min": 0.36308351159095764, "sampling/sampling_logp_difference/max": 1.0326343655586243, "sampling/sampling_logp_difference/mean": 0.012726966850459575, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1422.2, "completions/max_terminated_length": 1422.2, "completions/mean_length": 1061.034375, "completions/mean_terminated_length": 1061.034375, "completions/min_length": 775.2, "completions/min_terminated_length": 775.2, "entropy": 0.23590830862522125, "epoch": 2.191539365452409, "frac_reward_zero_std": 0.55, "grad_norm": 0.39766162633895874, "learning_rate": 2.89677732008723e-07, "loss": 0.0094, "num_tokens": 249855890.0, "reward": 0.7854166746139526, "reward_std": 0.08812462836503983, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7854166746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.32434244602918627, "sampling/importance_sampling_ratio/max": 1.972498369216919, "sampling/importance_sampling_ratio/mean": 0.9998962163925171, "sampling/importance_sampling_ratio/min": 0.3102016121149063, "sampling/sampling_logp_difference/max": 1.3020723462104797, "sampling/sampling_logp_difference/mean": 0.013067025132477284, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1469.4, "completions/max_terminated_length": 1469.4, "completions/mean_length": 1075.846875, "completions/mean_terminated_length": 1075.846875, "completions/min_length": 755.2, "completions/min_terminated_length": 755.2, "entropy": 0.2341611683368683, "epoch": 2.197414806110458, "frac_reward_zero_std": 0.4, "grad_norm": 0.7978026866912842, "learning_rate": 2.890719651078265e-07, "loss": 0.0006, "num_tokens": 250511697.0, "reward": 0.7937500238418579, "reward_std": 0.145812551677227, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7937500238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.3256185740232468, "sampling/importance_sampling_ratio/max": 1.9232592821121215, "sampling/importance_sampling_ratio/mean": 0.9999657511711121, "sampling/importance_sampling_ratio/min": 0.3169225871562958, "sampling/sampling_logp_difference/max": 1.1892317295074464, "sampling/sampling_logp_difference/mean": 0.012808991968631745, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1369.4, "completions/max_terminated_length": 1369.4, "completions/mean_length": 993.428125, "completions/mean_terminated_length": 993.428125, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "entropy": 0.22252206802368163, "epoch": 2.203290246768508, "frac_reward_zero_std": 0.6, "grad_norm": 0.4887593984603882, "learning_rate": 2.8846619820692995e-07, "loss": 0.0004, "num_tokens": 251148570.0, "reward": 0.8028125166893005, "reward_std": 0.07628065943717957, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8028125166893005, "rewards/e2e_recall_precision_mixed_reward/std": 0.22741907089948654, "sampling/importance_sampling_ratio/max": 1.9813610076904298, "sampling/importance_sampling_ratio/mean": 0.9999027252197266, "sampling/importance_sampling_ratio/min": 0.3122305542230606, "sampling/sampling_logp_difference/max": 1.2302313566207885, "sampling/sampling_logp_difference/mean": 0.012417474761605263, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375, "completions/max_length": 1859.2, "completions/max_terminated_length": 1857.8, "completions/mean_length": 1118.25, "completions/mean_terminated_length": 1105.8674560546874, "completions/min_length": 749.8, "completions/min_terminated_length": 749.8, "entropy": 0.21847314536571502, "epoch": 2.209165687426557, "frac_reward_zero_std": 0.45, "grad_norm": 0.8039262294769287, "learning_rate": 2.8786043130603344e-07, "loss": -0.0113, "num_tokens": 251849278.0, "reward": 0.8166666865348816, "reward_std": 0.13573960661888124, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8166666865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.23900896161794663, "sampling/importance_sampling_ratio/max": 1.887087082862854, "sampling/importance_sampling_ratio/mean": 0.9999661207199096, "sampling/importance_sampling_ratio/min": 0.3232264846563339, "sampling/sampling_logp_difference/max": 1.1839786291122436, "sampling/sampling_logp_difference/mean": 0.012019017711281776, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1323.8, "completions/max_terminated_length": 1323.8, "completions/mean_length": 988.34375, "completions/mean_terminated_length": 988.34375, "completions/min_length": 726.4, "completions/min_terminated_length": 726.4, "entropy": 0.2140260010957718, "epoch": 2.2150411280846063, "frac_reward_zero_std": 0.7, "grad_norm": 0.655967652797699, "learning_rate": 2.8725466440513693e-07, "loss": -0.0032, "num_tokens": 252451276.0, "reward": 0.884375, "reward_std": 0.05798763036727905, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8843750119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.23959563821554183, "sampling/importance_sampling_ratio/max": 1.9973509311676025, "sampling/importance_sampling_ratio/mean": 0.999974501132965, "sampling/importance_sampling_ratio/min": 0.3508861005679928, "sampling/sampling_logp_difference/max": 5.175508713722229, "sampling/sampling_logp_difference/mean": 0.011773454025387764, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1479.0, "completions/max_terminated_length": 1479.0, "completions/mean_length": 1093.64375, "completions/mean_terminated_length": 1093.64375, "completions/min_length": 754.6, "completions/min_terminated_length": 754.6, "entropy": 0.2178718239068985, "epoch": 2.2209165687426555, "frac_reward_zero_std": 0.6, "grad_norm": 0.7926600575447083, "learning_rate": 2.8664889750424037e-07, "loss": -0.0018, "num_tokens": 253140282.0, "reward": 0.8203125, "reward_std": 0.10103268027305604, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8203125119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.2567353665828705, "sampling/importance_sampling_ratio/max": 1.9315465211868286, "sampling/importance_sampling_ratio/mean": 1.0000144124031067, "sampling/importance_sampling_ratio/min": 0.3091754883527756, "sampling/sampling_logp_difference/max": 1.2065674543380738, "sampling/sampling_logp_difference/mean": 0.012078930810093879, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1514.6, "completions/max_terminated_length": 1514.6, "completions/mean_length": 1054.10625, "completions/mean_terminated_length": 1054.10625, "completions/min_length": 697.4, "completions/min_terminated_length": 697.4, "entropy": 0.23007131218910218, "epoch": 2.226792009400705, "frac_reward_zero_std": 0.5, "grad_norm": 0.6741881966590881, "learning_rate": 2.8604313060334386e-07, "loss": -0.0008, "num_tokens": 253784924.0, "reward": 0.800000011920929, "reward_std": 0.06402734369039535, "rewards/e2e_recall_precision_mixed_reward/mean": 0.800000011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.26122883558273313, "sampling/importance_sampling_ratio/max": 1.8661206007003783, "sampling/importance_sampling_ratio/mean": 1.0000409960746766, "sampling/importance_sampling_ratio/min": 0.35816158950328825, "sampling/sampling_logp_difference/max": 1.1304970264434815, "sampling/sampling_logp_difference/mean": 0.012475567311048508, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1391.8, "completions/max_terminated_length": 1391.8, "completions/mean_length": 1053.721875, "completions/mean_terminated_length": 1053.721875, "completions/min_length": 827.8, "completions/min_terminated_length": 827.8, "entropy": 0.2240033119916916, "epoch": 2.2326674500587544, "frac_reward_zero_std": 0.4, "grad_norm": 0.6372376680374146, "learning_rate": 2.854373637024473e-07, "loss": 0.0012, "num_tokens": 254444355.0, "reward": 0.8333333373069763, "reward_std": 0.09950350672006607, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8333333373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.2455697923898697, "sampling/importance_sampling_ratio/max": 1.9715130090713502, "sampling/importance_sampling_ratio/mean": 0.9999606847763062, "sampling/importance_sampling_ratio/min": 0.32975890338420866, "sampling/sampling_logp_difference/max": 1.141264510154724, "sampling/sampling_logp_difference/mean": 0.012219694256782532, "step": 1900 }, { "epoch": 2.2326674500587544, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.000625, "eval_completions/max_length": 1452.08, "eval_completions/max_terminated_length": 1445.28, "eval_completions/mean_length": 1023.909375, "eval_completions/mean_terminated_length": 1023.0136328125, "eval_completions/min_length": 751.44, "eval_completions/min_terminated_length": 751.44, "eval_entropy": 0.23475720524787902, "eval_frac_reward_zero_std": 0.53, "eval_loss": -0.0010693141957744956, "eval_num_tokens": 254444355.0, "eval_reward": 0.7352083444595336, "eval_reward_std": 0.09326890490949154, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7352083444595336, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.30484940618276596, "eval_runtime": 394.069, "eval_samples_per_second": 0.254, "eval_sampling/importance_sampling_ratio/max": 1.9632924747467042, "eval_sampling/importance_sampling_ratio/mean": 0.9999413275718689, "eval_sampling/importance_sampling_ratio/min": 0.31770970672369003, "eval_sampling/sampling_logp_difference/max": 1.298856236934662, "eval_sampling/sampling_logp_difference/mean": 0.012761585600674152, "eval_steps_per_second": 0.005, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1544.0, "completions/max_terminated_length": 1544.0, "completions/mean_length": 1024.86875, "completions/mean_terminated_length": 1024.86875, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "entropy": 0.24251948595046996, "epoch": 2.2385428907168037, "frac_reward_zero_std": 0.5, "grad_norm": 0.6644178628921509, "learning_rate": 2.848315968015508e-07, "loss": 0.0004, "num_tokens": 255080473.0, "reward": 0.857812511920929, "reward_std": 0.08356589004397393, "rewards/e2e_recall_precision_mixed_reward/mean": 0.857812511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.24869934916496278, "sampling/importance_sampling_ratio/max": 1.9640849590301515, "sampling/importance_sampling_ratio/mean": 0.9999147415161133, "sampling/importance_sampling_ratio/min": 0.31352718770503996, "sampling/sampling_logp_difference/max": 1.1885489702224732, "sampling/sampling_logp_difference/mean": 0.01301195491105318, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 1718.2, "completions/max_terminated_length": 1665.8, "completions/mean_length": 1055.325, "completions/mean_terminated_length": 1046.8310302734376, "completions/min_length": 715.4, "completions/min_terminated_length": 715.4, "entropy": 0.22508153021335603, "epoch": 2.244418331374853, "frac_reward_zero_std": 0.55, "grad_norm": 0.37735798954963684, "learning_rate": 2.8422582990065416e-07, "loss": -0.0044, "num_tokens": 255740601.0, "reward": 0.7631250143051147, "reward_std": 0.0876377984881401, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7631250143051147, "rewards/e2e_recall_precision_mixed_reward/std": 0.2765495449304581, "sampling/importance_sampling_ratio/max": 1.9813214778900146, "sampling/importance_sampling_ratio/mean": 0.9998460412025452, "sampling/importance_sampling_ratio/min": 0.36637800335884096, "sampling/sampling_logp_difference/max": 1.1442196130752564, "sampling/sampling_logp_difference/mean": 0.012507494539022446, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1471.6, "completions/max_terminated_length": 1471.6, "completions/mean_length": 1069.365625, "completions/mean_terminated_length": 1069.365625, "completions/min_length": 731.2, "completions/min_terminated_length": 731.2, "entropy": 0.23527201116085053, "epoch": 2.2502937720329026, "frac_reward_zero_std": 0.5, "grad_norm": 0.4593348801136017, "learning_rate": 2.8362006299975765e-07, "loss": 0.0089, "num_tokens": 256408926.0, "reward": 0.776562511920929, "reward_std": 0.12371757328510284, "rewards/e2e_recall_precision_mixed_reward/mean": 0.776562511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.28311349302530286, "sampling/importance_sampling_ratio/max": 1.9783401966094971, "sampling/importance_sampling_ratio/mean": 1.0000746846199036, "sampling/importance_sampling_ratio/min": 0.2895603716373444, "sampling/sampling_logp_difference/max": 1.3389286994934082, "sampling/sampling_logp_difference/mean": 0.012912764959037305, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1559.0, "completions/max_terminated_length": 1559.0, "completions/mean_length": 1112.6125, "completions/mean_terminated_length": 1112.6125, "completions/min_length": 776.2, "completions/min_terminated_length": 776.2, "entropy": 0.23903321325778962, "epoch": 2.256169212690952, "frac_reward_zero_std": 0.5, "grad_norm": 0.4446127116680145, "learning_rate": 2.8301429609886114e-07, "loss": 0.0022, "num_tokens": 257103634.0, "reward": 0.8364583611488342, "reward_std": 0.1059862032532692, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8364583611488342, "rewards/e2e_recall_precision_mixed_reward/std": 0.258354589343071, "sampling/importance_sampling_ratio/max": 1.9829919576644897, "sampling/importance_sampling_ratio/mean": 0.9999783992767334, "sampling/importance_sampling_ratio/min": 0.2826462507247925, "sampling/sampling_logp_difference/max": 1.4443536758422852, "sampling/sampling_logp_difference/mean": 0.013122853077948093, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1565.4, "completions/max_terminated_length": 1565.4, "completions/mean_length": 1123.990625, "completions/mean_terminated_length": 1123.990625, "completions/min_length": 871.4, "completions/min_terminated_length": 871.4, "entropy": 0.22474170625209808, "epoch": 2.262044653349001, "frac_reward_zero_std": 0.65, "grad_norm": 0.4290720820426941, "learning_rate": 2.824085291979646e-07, "loss": 0.0004, "num_tokens": 257766399.0, "reward": 0.8651041746139526, "reward_std": 0.061245692521333696, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8651041746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.25154909715056417, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001497983932495, "sampling/importance_sampling_ratio/min": 0.2948518693447113, "sampling/sampling_logp_difference/max": 1.2655499935150147, "sampling/sampling_logp_difference/mean": 0.01205264199525118, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 1148.1375, "completions/mean_terminated_length": 1148.1375, "completions/min_length": 848.4, "completions/min_terminated_length": 848.4, "entropy": 0.239599347114563, "epoch": 2.2679200940070503, "frac_reward_zero_std": 0.4, "grad_norm": 0.6222954392433167, "learning_rate": 2.8180276229706807e-07, "loss": -0.0026, "num_tokens": 258515195.0, "reward": 0.7656250059604645, "reward_std": 0.09615588523447513, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7656250059604645, "rewards/e2e_recall_precision_mixed_reward/std": 0.2964589536190033, "sampling/importance_sampling_ratio/max": 1.9878407716751099, "sampling/importance_sampling_ratio/mean": 1.0000786304473877, "sampling/importance_sampling_ratio/min": 0.25952497124671936, "sampling/sampling_logp_difference/max": 1.3694255113601685, "sampling/sampling_logp_difference/mean": 0.013026346825063229, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1531.2, "completions/max_terminated_length": 1531.2, "completions/mean_length": 1101.3125, "completions/mean_terminated_length": 1101.3125, "completions/min_length": 838.2, "completions/min_terminated_length": 838.2, "entropy": 0.24195704162120818, "epoch": 2.2737955346651, "frac_reward_zero_std": 0.45, "grad_norm": 0.7305625677108765, "learning_rate": 2.8119699539617156e-07, "loss": -0.004, "num_tokens": 259177519.0, "reward": 0.8244270920753479, "reward_std": 0.09137791246175767, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8244270920753479, "rewards/e2e_recall_precision_mixed_reward/std": 0.22105640172958374, "sampling/importance_sampling_ratio/max": 1.9939350605010986, "sampling/importance_sampling_ratio/mean": 0.9998633027076721, "sampling/importance_sampling_ratio/min": 0.3458234578371048, "sampling/sampling_logp_difference/max": 1.3980162143707275, "sampling/sampling_logp_difference/mean": 0.013009026646614075, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1313.6, "completions/max_terminated_length": 1313.6, "completions/mean_length": 983.740625, "completions/mean_terminated_length": 983.740625, "completions/min_length": 751.2, "completions/min_terminated_length": 751.2, "entropy": 0.2223033517599106, "epoch": 2.279670975323149, "frac_reward_zero_std": 0.65, "grad_norm": 0.4795616567134857, "learning_rate": 2.80591228495275e-07, "loss": -0.001, "num_tokens": 259834556.0, "reward": 0.7640625178813935, "reward_std": 0.08006698191165924, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7640625178813935, "rewards/e2e_recall_precision_mixed_reward/std": 0.3285771101713181, "sampling/importance_sampling_ratio/max": 1.9821281909942627, "sampling/importance_sampling_ratio/mean": 1.0000587821006774, "sampling/importance_sampling_ratio/min": 0.41876710653305055, "sampling/sampling_logp_difference/max": 0.8904253721237183, "sampling/sampling_logp_difference/mean": 0.01218780390918255, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1554.6, "completions/max_terminated_length": 1554.6, "completions/mean_length": 1059.265625, "completions/mean_terminated_length": 1059.265625, "completions/min_length": 740.6, "completions/min_terminated_length": 740.6, "entropy": 0.24593849778175353, "epoch": 2.2855464159811985, "frac_reward_zero_std": 0.4, "grad_norm": 0.9010393619537354, "learning_rate": 2.799854615943785e-07, "loss": 0.0023, "num_tokens": 260492209.0, "reward": 0.7838541865348816, "reward_std": 0.14079142212867737, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7838541865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.301932692527771, "sampling/importance_sampling_ratio/max": 1.9539946079254151, "sampling/importance_sampling_ratio/mean": 1.0001062393188476, "sampling/importance_sampling_ratio/min": 0.37328195571899414, "sampling/sampling_logp_difference/max": 1.0297700881958007, "sampling/sampling_logp_difference/mean": 0.013037090376019477, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1615.0, "completions/max_terminated_length": 1615.0, "completions/mean_length": 1110.025, "completions/mean_terminated_length": 1110.025, "completions/min_length": 782.0, "completions/min_terminated_length": 782.0, "entropy": 0.23949267864227294, "epoch": 2.291421856639248, "frac_reward_zero_std": 0.55, "grad_norm": 0.5884570479393005, "learning_rate": 2.793796946934819e-07, "loss": 0.0058, "num_tokens": 261155113.0, "reward": 0.8239583492279052, "reward_std": 0.07468674443662167, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8239583492279052, "rewards/e2e_recall_precision_mixed_reward/std": 0.2608910098671913, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999876737594604, "sampling/importance_sampling_ratio/min": 0.350416773557663, "sampling/sampling_logp_difference/max": 1.0721632480621337, "sampling/sampling_logp_difference/mean": 0.012836766429245472, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1462.6, "completions/max_terminated_length": 1462.6, "completions/mean_length": 1101.3125, "completions/mean_terminated_length": 1101.3125, "completions/min_length": 853.8, "completions/min_terminated_length": 853.8, "entropy": 0.24034703075885772, "epoch": 2.2972972972972974, "frac_reward_zero_std": 0.75, "grad_norm": 0.44212859869003296, "learning_rate": 2.787739277925854e-07, "loss": 0.0027, "num_tokens": 261805885.0, "reward": 0.7859375, "reward_std": 0.05160985812544823, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7859375, "rewards/e2e_recall_precision_mixed_reward/std": 0.3121504485607147, "sampling/importance_sampling_ratio/max": 1.9134503841400146, "sampling/importance_sampling_ratio/mean": 0.9998910069465637, "sampling/importance_sampling_ratio/min": 0.36197959780693056, "sampling/sampling_logp_difference/max": 1.118364405632019, "sampling/sampling_logp_difference/mean": 0.012630455754697322, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1554.6, "completions/max_terminated_length": 1554.6, "completions/mean_length": 1110.775, "completions/mean_terminated_length": 1110.775, "completions/min_length": 830.0, "completions/min_terminated_length": 830.0, "entropy": 0.24084980189800262, "epoch": 2.3031727379553466, "frac_reward_zero_std": 0.5, "grad_norm": 0.8059061765670776, "learning_rate": 2.781681608916889e-07, "loss": -0.0017, "num_tokens": 262485749.0, "reward": 0.7385416746139526, "reward_std": 0.09334568008780479, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7385416746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.3025936484336853, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000214099884033, "sampling/importance_sampling_ratio/min": 0.3065564423799515, "sampling/sampling_logp_difference/max": 1.4883403539657594, "sampling/sampling_logp_difference/mean": 0.013054091669619083, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1720.4, "completions/max_terminated_length": 1720.4, "completions/mean_length": 1106.85625, "completions/mean_terminated_length": 1106.85625, "completions/min_length": 745.6, "completions/min_terminated_length": 745.6, "entropy": 0.2630400389432907, "epoch": 2.309048178613396, "frac_reward_zero_std": 0.75, "grad_norm": 0.0, "learning_rate": 2.7756239399079234e-07, "loss": 0.0032, "num_tokens": 263183159.0, "reward": 0.7591145873069763, "reward_std": 0.04956208989024162, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7591145873069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.33793588876724245, "sampling/importance_sampling_ratio/max": 1.9396537780761718, "sampling/importance_sampling_ratio/mean": 0.9998874068260193, "sampling/importance_sampling_ratio/min": 0.40306171774864197, "sampling/sampling_logp_difference/max": 0.9383793830871582, "sampling/sampling_logp_difference/mean": 0.013850330747663975, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1515.6, "completions/max_terminated_length": 1515.6, "completions/mean_length": 1097.1875, "completions/mean_terminated_length": 1097.1875, "completions/min_length": 781.4, "completions/min_terminated_length": 781.4, "entropy": 0.2332218050956726, "epoch": 2.3149236192714455, "frac_reward_zero_std": 0.45, "grad_norm": 0.8961458802223206, "learning_rate": 2.769566270898958e-07, "loss": 0.0008, "num_tokens": 263877299.0, "reward": 0.8614583611488342, "reward_std": 0.12068969756364822, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8614583611488342, "rewards/e2e_recall_precision_mixed_reward/std": 0.24193784296512605, "sampling/importance_sampling_ratio/max": 1.9547238111495973, "sampling/importance_sampling_ratio/mean": 0.999898374080658, "sampling/importance_sampling_ratio/min": 0.4016565144062042, "sampling/sampling_logp_difference/max": 0.9715569138526916, "sampling/sampling_logp_difference/mean": 0.012774857692420483, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1533.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 1126.8375, "completions/mean_terminated_length": 1126.8375, "completions/min_length": 795.8, "completions/min_terminated_length": 795.8, "entropy": 0.2287152588367462, "epoch": 2.3207990599294948, "frac_reward_zero_std": 0.55, "grad_norm": 0.3858145773410797, "learning_rate": 2.7635086018899926e-07, "loss": 0.0003, "num_tokens": 264564719.0, "reward": 0.8640625238418579, "reward_std": 0.08302248492836953, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8640625238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.20304519385099412, "sampling/importance_sampling_ratio/max": 1.984503149986267, "sampling/importance_sampling_ratio/mean": 0.9999793767929077, "sampling/importance_sampling_ratio/min": 0.39244469404220583, "sampling/sampling_logp_difference/max": 1.0014652013778687, "sampling/sampling_logp_difference/mean": 0.012451635673642159, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.028125, "completions/max_length": 1970.2, "completions/max_terminated_length": 1945.0, "completions/mean_length": 1229.9, "completions/mean_terminated_length": 1199.3093505859374, "completions/min_length": 792.4, "completions/min_terminated_length": 792.4, "entropy": 0.22849614918231964, "epoch": 2.326674500587544, "frac_reward_zero_std": 0.55, "grad_norm": 0.5069258213043213, "learning_rate": 2.7574509328810275e-07, "loss": -0.0273, "num_tokens": 265271995.0, "reward": 0.7555208563804626, "reward_std": 0.0813896507024765, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7555208563804626, "rewards/e2e_recall_precision_mixed_reward/std": 0.34186806380748747, "sampling/importance_sampling_ratio/max": 1.9778911828994752, "sampling/importance_sampling_ratio/mean": 0.9999969363212585, "sampling/importance_sampling_ratio/min": 0.27652696073055266, "sampling/sampling_logp_difference/max": 1.351327657699585, "sampling/sampling_logp_difference/mean": 0.012531153298914433, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1454.6, "completions/max_terminated_length": 1454.6, "completions/mean_length": 1057.096875, "completions/mean_terminated_length": 1057.096875, "completions/min_length": 812.0, "completions/min_terminated_length": 812.0, "entropy": 0.22988880872726442, "epoch": 2.3325499412455932, "frac_reward_zero_std": 0.5, "grad_norm": 0.6030939817428589, "learning_rate": 2.7513932638720624e-07, "loss": 0.0031, "num_tokens": 265935338.0, "reward": 0.782812523841858, "reward_std": 0.09311963804066181, "rewards/e2e_recall_precision_mixed_reward/mean": 0.782812523841858, "rewards/e2e_recall_precision_mixed_reward/std": 0.3007162183523178, "sampling/importance_sampling_ratio/max": 1.9878717422485352, "sampling/importance_sampling_ratio/mean": 1.0000318169593811, "sampling/importance_sampling_ratio/min": 0.26722107380628585, "sampling/sampling_logp_difference/max": 1.5060338497161865, "sampling/sampling_logp_difference/mean": 0.012719903513789177, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1966.2, "completions/max_terminated_length": 1966.2, "completions/mean_length": 1223.465625, "completions/mean_terminated_length": 1223.465625, "completions/min_length": 884.8, "completions/min_terminated_length": 884.8, "entropy": 0.2435948759317398, "epoch": 2.338425381903643, "frac_reward_zero_std": 0.5, "grad_norm": 0.0, "learning_rate": 2.745335594863096e-07, "loss": -0.0007, "num_tokens": 266639487.0, "reward": 0.838281261920929, "reward_std": 0.09026078432798386, "rewards/e2e_recall_precision_mixed_reward/mean": 0.838281261920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.2680289790034294, "sampling/importance_sampling_ratio/max": 1.9598480701446532, "sampling/importance_sampling_ratio/mean": 0.9998942852020264, "sampling/importance_sampling_ratio/min": 0.2704477931372821, "sampling/sampling_logp_difference/max": 2.0783373355865478, "sampling/sampling_logp_difference/mean": 0.013020814210176469, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1626.4, "completions/max_terminated_length": 1626.4, "completions/mean_length": 1127.540625, "completions/mean_terminated_length": 1127.540625, "completions/min_length": 709.0, "completions/min_terminated_length": 709.0, "entropy": 0.24840072691440582, "epoch": 2.344300822561692, "frac_reward_zero_std": 0.6, "grad_norm": 0.5901416540145874, "learning_rate": 2.739277925854131e-07, "loss": 0.0035, "num_tokens": 267330828.0, "reward": 0.7151041865348816, "reward_std": 0.08581260442733765, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7151041865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.3605960875749588, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001015663146973, "sampling/importance_sampling_ratio/min": 0.42087502479553224, "sampling/sampling_logp_difference/max": 1.0152322053909302, "sampling/sampling_logp_difference/mean": 0.013321847841143607, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 1965.6, "completions/max_terminated_length": 1852.4, "completions/mean_length": 1199.00625, "completions/mean_terminated_length": 1191.6684326171876, "completions/min_length": 788.4, "completions/min_terminated_length": 788.4, "entropy": 0.2513920724391937, "epoch": 2.3501762632197414, "frac_reward_zero_std": 0.6, "grad_norm": 0.5038850903511047, "learning_rate": 2.7332202568451655e-07, "loss": -0.0232, "num_tokens": 268033542.0, "reward": 0.8557291746139526, "reward_std": 0.07250104248523712, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8557291746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.21458423137664795, "sampling/importance_sampling_ratio/max": 1.9932072162628174, "sampling/importance_sampling_ratio/mean": 1.0000502467155457, "sampling/importance_sampling_ratio/min": 0.37983145117759703, "sampling/sampling_logp_difference/max": 1.0120579957962037, "sampling/sampling_logp_difference/mean": 0.013376428000628948, "step": 2000 }, { "epoch": 2.3501762632197414, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1608.16, "eval_completions/max_terminated_length": 1608.16, "eval_completions/mean_length": 1127.00875, "eval_completions/mean_terminated_length": 1127.00875, "eval_completions/min_length": 836.88, "eval_completions/min_terminated_length": 836.88, "eval_entropy": 0.25097706019878385, "eval_frac_reward_zero_std": 0.54, "eval_loss": 0.0010900250636041164, "eval_num_tokens": 268033542.0, "eval_reward": 0.7470312607288361, "eval_reward_std": 0.08885785259306431, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7470312631130218, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.30599226742982866, "eval_runtime": 439.0898, "eval_samples_per_second": 0.228, "eval_sampling/importance_sampling_ratio/max": 1.9520465230941773, "eval_sampling/importance_sampling_ratio/mean": 0.9999663019180298, "eval_sampling/importance_sampling_ratio/min": 0.33184033348748926, "eval_sampling/sampling_logp_difference/max": 1.5774301075935364, "eval_sampling/sampling_logp_difference/mean": 0.013306757099926472, "eval_steps_per_second": 0.005, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1531.6, "completions/max_terminated_length": 1531.6, "completions/mean_length": 1145.7625, "completions/mean_terminated_length": 1145.7625, "completions/min_length": 822.0, "completions/min_terminated_length": 822.0, "entropy": 0.2629284977912903, "epoch": 2.3560517038777906, "frac_reward_zero_std": 0.5, "grad_norm": 0.6058534383773804, "learning_rate": 2.7271625878362004e-07, "loss": 0.0053, "num_tokens": 268706202.0, "reward": 0.7458333492279052, "reward_std": 0.08412181735038757, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7458333492279052, "rewards/e2e_recall_precision_mixed_reward/std": 0.28866009414196014, "sampling/importance_sampling_ratio/max": 1.9702019453048707, "sampling/importance_sampling_ratio/mean": 1.0000141739845276, "sampling/importance_sampling_ratio/min": 0.2838461309671402, "sampling/sampling_logp_difference/max": 1.3169232606887817, "sampling/sampling_logp_difference/mean": 0.013618256896734238, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1781.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 1223.86875, "completions/mean_terminated_length": 1223.86875, "completions/min_length": 888.2, "completions/min_terminated_length": 888.2, "entropy": 0.24649737477302552, "epoch": 2.3619271445358403, "frac_reward_zero_std": 0.55, "grad_norm": 0.6726218461990356, "learning_rate": 2.7211049188272353e-07, "loss": -0.0022, "num_tokens": 269407984.0, "reward": 0.8861979365348815, "reward_std": 0.0921474851667881, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8861979365348815, "rewards/e2e_recall_precision_mixed_reward/std": 0.22615295350551606, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998706579208374, "sampling/importance_sampling_ratio/min": 0.2436885952949721, "sampling/sampling_logp_difference/max": 7.224675178527832, "sampling/sampling_logp_difference/mean": 0.013193446025252343, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1376.2, "completions/max_terminated_length": 1376.2, "completions/mean_length": 1066.721875, "completions/mean_terminated_length": 1066.721875, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "entropy": 0.23765726387500763, "epoch": 2.3678025851938895, "frac_reward_zero_std": 0.7, "grad_norm": 0.8555007576942444, "learning_rate": 2.7150472498182696e-07, "loss": -0.0028, "num_tokens": 270033255.0, "reward": 0.9101562619209289, "reward_std": 0.07478788420557976, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9101562619209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.13179679438471795, "sampling/importance_sampling_ratio/max": 1.9186473608016967, "sampling/importance_sampling_ratio/mean": 0.9999523162841797, "sampling/importance_sampling_ratio/min": 0.3806654095649719, "sampling/sampling_logp_difference/max": 1.0193307399749756, "sampling/sampling_logp_difference/mean": 0.012833572551608085, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1591.8, "completions/max_terminated_length": 1591.8, "completions/mean_length": 1153.6875, "completions/mean_terminated_length": 1153.6875, "completions/min_length": 900.2, "completions/min_terminated_length": 900.2, "entropy": 0.2594814360141754, "epoch": 2.3736780258519388, "frac_reward_zero_std": 0.5, "grad_norm": 0.44845935702323914, "learning_rate": 2.7089895808093045e-07, "loss": 0.0057, "num_tokens": 270722227.0, "reward": 0.8557291984558105, "reward_std": 0.11051277071237564, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8557291984558105, "rewards/e2e_recall_precision_mixed_reward/std": 0.2391131788492203, "sampling/importance_sampling_ratio/max": 1.9824217557907104, "sampling/importance_sampling_ratio/mean": 0.9999794125556946, "sampling/importance_sampling_ratio/min": 0.3954480618238449, "sampling/sampling_logp_difference/max": 1.0284236431121827, "sampling/sampling_logp_difference/mean": 0.013628228195011615, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1617.8, "completions/max_terminated_length": 1617.8, "completions/mean_length": 1185.45, "completions/mean_terminated_length": 1185.45, "completions/min_length": 843.6, "completions/min_terminated_length": 843.6, "entropy": 0.2511540025472641, "epoch": 2.3795534665099884, "frac_reward_zero_std": 0.6, "grad_norm": 0.442772775888443, "learning_rate": 2.702931911800339e-07, "loss": 0.0026, "num_tokens": 271446675.0, "reward": 0.8489583492279053, "reward_std": 0.08587736189365387, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8489583492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.23618671298027039, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000007688999176, "sampling/importance_sampling_ratio/min": 0.1813358840532601, "sampling/sampling_logp_difference/max": 2.310008430480957, "sampling/sampling_logp_difference/mean": 0.013332589715719222, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1752.6, "completions/max_terminated_length": 1752.6, "completions/mean_length": 1194.28125, "completions/mean_terminated_length": 1194.28125, "completions/min_length": 903.8, "completions/min_terminated_length": 903.8, "entropy": 0.25064473152160643, "epoch": 2.3854289071680377, "frac_reward_zero_std": 0.55, "grad_norm": 0.45136576890945435, "learning_rate": 2.696874242791374e-07, "loss": -0.0014, "num_tokens": 272145981.0, "reward": 0.7092187583446503, "reward_std": 0.10091326609253884, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7092187583446503, "rewards/e2e_recall_precision_mixed_reward/std": 0.2888171553611755, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999277353286743, "sampling/importance_sampling_ratio/min": 0.2797494070604444, "sampling/sampling_logp_difference/max": 2.1815228700637816, "sampling/sampling_logp_difference/mean": 0.013193762302398682, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1767.8, "completions/max_terminated_length": 1767.8, "completions/mean_length": 1246.7625, "completions/mean_terminated_length": 1246.7625, "completions/min_length": 885.8, "completions/min_terminated_length": 885.8, "entropy": 0.2690925747156143, "epoch": 2.391304347826087, "frac_reward_zero_std": 0.7, "grad_norm": 0.6555709838867188, "learning_rate": 2.6908165737824087e-07, "loss": 0.0005, "num_tokens": 272869025.0, "reward": 0.878125011920929, "reward_std": 0.05481426417827606, "rewards/e2e_recall_precision_mixed_reward/mean": 0.878125011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.20230115056037903, "sampling/importance_sampling_ratio/max": 1.9401631832122803, "sampling/importance_sampling_ratio/mean": 0.9999709963798523, "sampling/importance_sampling_ratio/min": 0.2493190795183533, "sampling/sampling_logp_difference/max": 6.924089002609253, "sampling/sampling_logp_difference/mean": 0.013956866040825845, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1756.0, "completions/max_terminated_length": 1756.0, "completions/mean_length": 1192.321875, "completions/mean_terminated_length": 1192.321875, "completions/min_length": 823.6, "completions/min_terminated_length": 823.6, "entropy": 0.25292410254478453, "epoch": 2.397179788484136, "frac_reward_zero_std": 0.6, "grad_norm": 0.0, "learning_rate": 2.684758904773443e-07, "loss": 0.0022, "num_tokens": 273569704.0, "reward": 0.9161458492279053, "reward_std": 0.08007604032754898, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9161458492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.17229452580213547, "sampling/importance_sampling_ratio/max": 1.9493489265441895, "sampling/importance_sampling_ratio/mean": 0.999928891658783, "sampling/importance_sampling_ratio/min": 0.3117185816168785, "sampling/sampling_logp_difference/max": 1.3051450490951537, "sampling/sampling_logp_difference/mean": 0.013440205156803131, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1764.6, "completions/max_terminated_length": 1764.6, "completions/mean_length": 1189.453125, "completions/mean_terminated_length": 1189.453125, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "entropy": 0.2691080868244171, "epoch": 2.403055229142186, "frac_reward_zero_std": 0.65, "grad_norm": 0.7951878905296326, "learning_rate": 2.678701235764478e-07, "loss": 0.0033, "num_tokens": 274263145.0, "reward": 0.8606770873069763, "reward_std": 0.06981215178966522, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8606770873069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.2001673936843872, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000786066055298, "sampling/importance_sampling_ratio/min": 0.2740499794483185, "sampling/sampling_logp_difference/max": 1.6949212074279785, "sampling/sampling_logp_difference/mean": 0.013879082910716534, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1926.8, "completions/max_terminated_length": 1926.8, "completions/mean_length": 1224.5125, "completions/mean_terminated_length": 1224.5125, "completions/min_length": 907.0, "completions/min_terminated_length": 907.0, "entropy": 0.27390618324279786, "epoch": 2.408930669800235, "frac_reward_zero_std": 0.6, "grad_norm": 0.4256892204284668, "learning_rate": 2.6726435667555123e-07, "loss": 0.0096, "num_tokens": 275021357.0, "reward": 0.911718773841858, "reward_std": 0.07080408632755279, "rewards/e2e_recall_precision_mixed_reward/mean": 0.911718773841858, "rewards/e2e_recall_precision_mixed_reward/std": 0.1760246217250824, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999222755432129, "sampling/importance_sampling_ratio/min": 0.3092021256685257, "sampling/sampling_logp_difference/max": 1.2760996580123902, "sampling/sampling_logp_difference/mean": 0.01414750088006258, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1639.6, "completions/max_terminated_length": 1639.6, "completions/mean_length": 1126.825, "completions/mean_terminated_length": 1126.825, "completions/min_length": 749.2, "completions/min_terminated_length": 749.2, "entropy": 0.2581260442733765, "epoch": 2.4148061104582843, "frac_reward_zero_std": 0.65, "grad_norm": 0.45884257555007935, "learning_rate": 2.666585897746547e-07, "loss": 0.004, "num_tokens": 275683397.0, "reward": 0.8592708349227905, "reward_std": 0.05926808714866638, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8592708349227905, "rewards/e2e_recall_precision_mixed_reward/std": 0.23570542484521867, "sampling/importance_sampling_ratio/max": 1.9564772129058838, "sampling/importance_sampling_ratio/mean": 0.9999432563781738, "sampling/importance_sampling_ratio/min": 0.332023561000824, "sampling/sampling_logp_difference/max": 1.158114504814148, "sampling/sampling_logp_difference/mean": 0.013491682521998883, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1776.4, "completions/max_terminated_length": 1776.4, "completions/mean_length": 1245.8375, "completions/mean_terminated_length": 1245.8375, "completions/min_length": 906.2, "completions/min_terminated_length": 906.2, "entropy": 0.26880968511104586, "epoch": 2.4206815511163335, "frac_reward_zero_std": 0.7, "grad_norm": 0.5789676904678345, "learning_rate": 2.660528228737582e-07, "loss": 0.005, "num_tokens": 276430721.0, "reward": 0.8739583373069764, "reward_std": 0.06670975238084793, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8739583373069764, "rewards/e2e_recall_precision_mixed_reward/std": 0.23361384719610215, "sampling/importance_sampling_ratio/max": 1.954941201210022, "sampling/importance_sampling_ratio/mean": 1.000061297416687, "sampling/importance_sampling_ratio/min": 0.31866523027420046, "sampling/sampling_logp_difference/max": 1.3414917469024659, "sampling/sampling_logp_difference/mean": 0.013910824991762638, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1777.4, "completions/max_terminated_length": 1777.4, "completions/mean_length": 1222.33125, "completions/mean_terminated_length": 1222.33125, "completions/min_length": 792.4, "completions/min_terminated_length": 792.4, "entropy": 0.27435516715049746, "epoch": 2.426556991774383, "frac_reward_zero_std": 0.45, "grad_norm": 0.47820526361465454, "learning_rate": 2.6544705597286165e-07, "loss": 0.0092, "num_tokens": 277175051.0, "reward": 0.8295312643051147, "reward_std": 0.11600432693958282, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8295312643051147, "rewards/e2e_recall_precision_mixed_reward/std": 0.28728988468647004, "sampling/importance_sampling_ratio/max": 1.9975449800491334, "sampling/importance_sampling_ratio/mean": 0.9999809026718139, "sampling/importance_sampling_ratio/min": 0.3298905849456787, "sampling/sampling_logp_difference/max": 1.2696374893188476, "sampling/sampling_logp_difference/mean": 0.014326155558228493, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1779.2, "completions/max_terminated_length": 1779.2, "completions/mean_length": 1233.875, "completions/mean_terminated_length": 1233.875, "completions/min_length": 863.8, "completions/min_terminated_length": 863.8, "entropy": 0.26715349555015566, "epoch": 2.4324324324324325, "frac_reward_zero_std": 0.65, "grad_norm": 0.46884390711784363, "learning_rate": 2.648412890719651e-07, "loss": -0.0021, "num_tokens": 277894115.0, "reward": 0.8973958373069764, "reward_std": 0.06307865604758263, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8973958492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.135838021337986, "sampling/importance_sampling_ratio/max": 1.9918049335479737, "sampling/importance_sampling_ratio/mean": 1.0000259518623351, "sampling/importance_sampling_ratio/min": 0.2770086288452148, "sampling/sampling_logp_difference/max": 1.4482748746871947, "sampling/sampling_logp_difference/mean": 0.013979560136795044, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 2042.6, "completions/max_terminated_length": 1876.8, "completions/mean_length": 1274.6, "completions/mean_terminated_length": 1267.1865966796875, "completions/min_length": 889.0, "completions/min_terminated_length": 889.0, "entropy": 0.26166120171546936, "epoch": 2.4383078730904817, "frac_reward_zero_std": 0.65, "grad_norm": 0.5546722412109375, "learning_rate": 2.642355221710685e-07, "loss": -0.0208, "num_tokens": 278637739.0, "reward": 0.8460416674613953, "reward_std": 0.07623862028121949, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8460416674613953, "rewards/e2e_recall_precision_mixed_reward/std": 0.22280243337154387, "sampling/importance_sampling_ratio/max": 1.9055000066757202, "sampling/importance_sampling_ratio/mean": 0.9998693823814392, "sampling/importance_sampling_ratio/min": 0.3392042249441147, "sampling/sampling_logp_difference/max": 1.152932047843933, "sampling/sampling_logp_difference/mean": 0.013772268965840339, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1897.2, "completions/max_terminated_length": 1842.2, "completions/mean_length": 1271.5375, "completions/mean_terminated_length": 1267.7753173828125, "completions/min_length": 902.8, "completions/min_terminated_length": 902.8, "entropy": 0.2623803198337555, "epoch": 2.444183313748531, "frac_reward_zero_std": 0.35, "grad_norm": 0.679646372795105, "learning_rate": 2.63629755270172e-07, "loss": -0.003, "num_tokens": 279384675.0, "reward": 0.6657291650772095, "reward_std": 0.14221025630831718, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6657291650772095, "rewards/e2e_recall_precision_mixed_reward/std": 0.38955762386322024, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999815106391907, "sampling/importance_sampling_ratio/min": 0.3362751841545105, "sampling/sampling_logp_difference/max": 1.1838315725326538, "sampling/sampling_logp_difference/mean": 0.013782840594649315, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1718.2, "completions/max_terminated_length": 1718.2, "completions/mean_length": 1200.925, "completions/mean_terminated_length": 1200.925, "completions/min_length": 834.0, "completions/min_terminated_length": 834.0, "entropy": 0.26449680924415586, "epoch": 2.4500587544065806, "frac_reward_zero_std": 0.65, "grad_norm": 0.6856433749198914, "learning_rate": 2.630239883692755e-07, "loss": -0.0049, "num_tokens": 280092555.0, "reward": 0.8588541746139526, "reward_std": 0.08078035041689872, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8588541746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.17709082067012788, "sampling/importance_sampling_ratio/max": 1.936391544342041, "sampling/importance_sampling_ratio/mean": 1.0000238418579102, "sampling/importance_sampling_ratio/min": 0.3401082783937454, "sampling/sampling_logp_difference/max": 1.2312336444854737, "sampling/sampling_logp_difference/mean": 0.013856485113501548, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1737.6, "completions/max_terminated_length": 1737.6, "completions/mean_length": 1294.9875, "completions/mean_terminated_length": 1294.9875, "completions/min_length": 969.4, "completions/min_terminated_length": 969.4, "entropy": 0.2628588765859604, "epoch": 2.45593419506463, "frac_reward_zero_std": 0.7, "grad_norm": 0.5648078918457031, "learning_rate": 2.6241822146837893e-07, "loss": 0.0025, "num_tokens": 280810599.0, "reward": 0.8257812619209289, "reward_std": 0.0739155262708664, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8257812619209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.29022433459758756, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999966311454773, "sampling/importance_sampling_ratio/min": 0.20766952782869338, "sampling/sampling_logp_difference/max": 1.7227513313293457, "sampling/sampling_logp_difference/mean": 0.013284758664667606, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1860.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 1239.553125, "completions/mean_terminated_length": 1239.553125, "completions/min_length": 862.2, "completions/min_terminated_length": 862.2, "entropy": 0.2770519554615021, "epoch": 2.461809635722679, "frac_reward_zero_std": 0.55, "grad_norm": 0.43463632464408875, "learning_rate": 2.618124545674824e-07, "loss": 0.0122, "num_tokens": 281529608.0, "reward": 0.7165104269981384, "reward_std": 0.07144647724926471, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7165104269981384, "rewards/e2e_recall_precision_mixed_reward/std": 0.32212514281272886, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000048518180846, "sampling/importance_sampling_ratio/min": 0.366385692358017, "sampling/sampling_logp_difference/max": 1.152387547492981, "sampling/sampling_logp_difference/mean": 0.014233771339058876, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1768.6, "completions/max_terminated_length": 1768.6, "completions/mean_length": 1226.5125, "completions/mean_terminated_length": 1226.5125, "completions/min_length": 857.6, "completions/min_terminated_length": 857.6, "entropy": 0.2612621784210205, "epoch": 2.4676850763807288, "frac_reward_zero_std": 0.6, "grad_norm": 0.3621138632297516, "learning_rate": 2.6120668766658586e-07, "loss": -0.0037, "num_tokens": 282204684.0, "reward": 0.8427083492279053, "reward_std": 0.07418096661567689, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8427083492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.22203905284404754, "sampling/importance_sampling_ratio/max": 1.9696928262710571, "sampling/importance_sampling_ratio/mean": 0.9999515414237976, "sampling/importance_sampling_ratio/min": 0.32047736793756487, "sampling/sampling_logp_difference/max": 1.3083037376403808, "sampling/sampling_logp_difference/mean": 0.013291514292359353, "step": 2100 }, { "epoch": 2.4676850763807288, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.000625, "eval_completions/max_length": 1706.68, "eval_completions/max_terminated_length": 1705.28, "eval_completions/mean_length": 1182.015625, "eval_completions/mean_terminated_length": 1181.2093798828125, "eval_completions/min_length": 865.84, "eval_completions/min_terminated_length": 865.84, "eval_entropy": 0.2664040964841843, "eval_frac_reward_zero_std": 0.6, "eval_loss": -0.0011476209620013833, "eval_num_tokens": 282204684.0, "eval_reward": 0.7496771001815796, "eval_reward_std": 0.08451524078845978, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7496771001815796, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3025799497961998, "eval_runtime": 464.0933, "eval_samples_per_second": 0.215, "eval_sampling/importance_sampling_ratio/max": 1.923496961593628, "eval_sampling/importance_sampling_ratio/mean": 0.9999792790412902, "eval_sampling/importance_sampling_ratio/min": 0.3148359860479832, "eval_sampling/sampling_logp_difference/max": 1.4014044141769408, "eval_sampling/sampling_logp_difference/mean": 0.013713168315589427, "eval_steps_per_second": 0.004, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1992.8, "completions/max_terminated_length": 1992.8, "completions/mean_length": 1313.50625, "completions/mean_terminated_length": 1313.50625, "completions/min_length": 933.2, "completions/min_terminated_length": 933.2, "entropy": 0.2658017486333847, "epoch": 2.473560517038778, "frac_reward_zero_std": 0.55, "grad_norm": 1.3053785562515259, "learning_rate": 2.6060092076568935e-07, "loss": -0.0029, "num_tokens": 282926030.0, "reward": 0.8159895896911621, "reward_std": 0.10865789279341698, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8159895896911621, "rewards/e2e_recall_precision_mixed_reward/std": 0.277511340379715, "sampling/importance_sampling_ratio/max": 1.9750770807266236, "sampling/importance_sampling_ratio/mean": 1.0000683188438415, "sampling/importance_sampling_ratio/min": 0.35810833275318144, "sampling/sampling_logp_difference/max": 1.1295485258102418, "sampling/sampling_logp_difference/mean": 0.013428233750164508, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1765.4, "completions/max_terminated_length": 1765.4, "completions/mean_length": 1209.60625, "completions/mean_terminated_length": 1209.60625, "completions/min_length": 854.2, "completions/min_terminated_length": 854.2, "entropy": 0.2738288462162018, "epoch": 2.4794359576968272, "frac_reward_zero_std": 0.55, "grad_norm": 0.4568771719932556, "learning_rate": 2.5999515386479284e-07, "loss": 0.0041, "num_tokens": 283648064.0, "reward": 0.835156261920929, "reward_std": 0.07496549636125564, "rewards/e2e_recall_precision_mixed_reward/mean": 0.835156261920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.21661584377288817, "sampling/importance_sampling_ratio/max": 1.8551706790924072, "sampling/importance_sampling_ratio/mean": 0.9999988913536072, "sampling/importance_sampling_ratio/min": 0.3922689139842987, "sampling/sampling_logp_difference/max": 0.945212459564209, "sampling/sampling_logp_difference/mean": 0.014047329686582089, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1461.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 1102.090625, "completions/mean_terminated_length": 1102.090625, "completions/min_length": 819.0, "completions/min_terminated_length": 819.0, "entropy": 0.24425167739391326, "epoch": 2.4853113983548765, "frac_reward_zero_std": 0.6, "grad_norm": 0.5156678557395935, "learning_rate": 2.593893869638963e-07, "loss": -0.0031, "num_tokens": 284362237.0, "reward": 0.7421875238418579, "reward_std": 0.07766841053962707, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7421875238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.3157324016094208, "sampling/importance_sampling_ratio/max": 1.9938726425170898, "sampling/importance_sampling_ratio/mean": 0.9999809503555298, "sampling/importance_sampling_ratio/min": 0.3194470554590225, "sampling/sampling_logp_difference/max": 1.3359221458435058, "sampling/sampling_logp_difference/mean": 0.013078029081225396, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1704.8, "completions/max_terminated_length": 1704.8, "completions/mean_length": 1215.6125, "completions/mean_terminated_length": 1215.6125, "completions/min_length": 815.6, "completions/min_terminated_length": 815.6, "entropy": 0.28841713070869446, "epoch": 2.491186839012926, "frac_reward_zero_std": 0.55, "grad_norm": 0.5904704332351685, "learning_rate": 2.5878362006299976e-07, "loss": -0.0011, "num_tokens": 285089361.0, "reward": 0.7183854222297669, "reward_std": 0.08566985726356506, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7183854222297669, "rewards/e2e_recall_precision_mixed_reward/std": 0.3276542216539383, "sampling/importance_sampling_ratio/max": 1.9800874948501588, "sampling/importance_sampling_ratio/mean": 1.00006422996521, "sampling/importance_sampling_ratio/min": 0.28533509075641633, "sampling/sampling_logp_difference/max": 1.3343998670578003, "sampling/sampling_logp_difference/mean": 0.014605691842734813, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1797.0, "completions/max_terminated_length": 1701.8, "completions/mean_length": 1217.321875, "completions/mean_terminated_length": 1213.289013671875, "completions/min_length": 903.0, "completions/min_terminated_length": 903.0, "entropy": 0.26930312514305116, "epoch": 2.4970622796709754, "frac_reward_zero_std": 0.4, "grad_norm": 0.6213445067405701, "learning_rate": 2.581778531621032e-07, "loss": -0.0049, "num_tokens": 285775444.0, "reward": 0.8020833611488343, "reward_std": 0.13593244403600693, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8020833611488343, "rewards/e2e_recall_precision_mixed_reward/std": 0.28477261066436765, "sampling/importance_sampling_ratio/max": 1.9315150022506713, "sampling/importance_sampling_ratio/mean": 0.9999381899833679, "sampling/importance_sampling_ratio/min": 0.34449381977319715, "sampling/sampling_logp_difference/max": 1.3405115842819213, "sampling/sampling_logp_difference/mean": 0.013642283342778682, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 2057.2, "completions/max_terminated_length": 1985.0, "completions/mean_length": 1294.4125, "completions/mean_terminated_length": 1291.0941650390625, "completions/min_length": 868.2, "completions/min_terminated_length": 868.2, "entropy": 0.26585444808006287, "epoch": 2.5029377203290246, "frac_reward_zero_std": 0.55, "grad_norm": 0.6043793559074402, "learning_rate": 2.575720862612067e-07, "loss": 0.009, "num_tokens": 286531988.0, "reward": 0.8127604246139526, "reward_std": 0.09144037812948227, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8127604246139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.2378230720758438, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999789953231811, "sampling/importance_sampling_ratio/min": 0.34047214686870575, "sampling/sampling_logp_difference/max": 1.1415401697158813, "sampling/sampling_logp_difference/mean": 0.013809867203235626, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1787.8, "completions/max_terminated_length": 1787.8, "completions/mean_length": 1196.678125, "completions/mean_terminated_length": 1196.678125, "completions/min_length": 850.0, "completions/min_terminated_length": 850.0, "entropy": 0.26385495364665984, "epoch": 2.5088131609870743, "frac_reward_zero_std": 0.6, "grad_norm": 0.4344102740287781, "learning_rate": 2.569663193603102e-07, "loss": 0.0036, "num_tokens": 287241901.0, "reward": 0.7539583444595337, "reward_std": 0.09427325129508972, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7539583444595337, "rewards/e2e_recall_precision_mixed_reward/std": 0.350679612159729, "sampling/importance_sampling_ratio/max": 1.8801328659057617, "sampling/importance_sampling_ratio/mean": 1.0000503182411193, "sampling/importance_sampling_ratio/min": 0.29096491932868956, "sampling/sampling_logp_difference/max": 1.2413696765899658, "sampling/sampling_logp_difference/mean": 0.013599349372088908, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1791.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 1240.184375, "completions/mean_terminated_length": 1240.184375, "completions/min_length": 815.2, "completions/min_terminated_length": 815.2, "entropy": 0.2642264664173126, "epoch": 2.5146886016451235, "frac_reward_zero_std": 0.45, "grad_norm": 0.4803769588470459, "learning_rate": 2.563605524594136e-07, "loss": -0.0011, "num_tokens": 287960408.0, "reward": 0.7619791865348816, "reward_std": 0.10066340118646622, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7619791984558105, "rewards/e2e_recall_precision_mixed_reward/std": 0.2697122097015381, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999095678329468, "sampling/importance_sampling_ratio/min": 0.22195086255669594, "sampling/sampling_logp_difference/max": 1.8092245578765869, "sampling/sampling_logp_difference/mean": 0.013568362034857274, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1848.6, "completions/max_terminated_length": 1848.6, "completions/mean_length": 1211.046875, "completions/mean_terminated_length": 1211.046875, "completions/min_length": 914.0, "completions/min_terminated_length": 914.0, "entropy": 0.2546990215778351, "epoch": 2.5205640423031728, "frac_reward_zero_std": 0.6, "grad_norm": 0.43225082755088806, "learning_rate": 2.557547855585171e-07, "loss": -0.0026, "num_tokens": 288649303.0, "reward": 0.8182291865348816, "reward_std": 0.08594144955277443, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8182291865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.24452958032488822, "sampling/importance_sampling_ratio/max": 1.9426300525665283, "sampling/importance_sampling_ratio/mean": 1.0000913858413696, "sampling/importance_sampling_ratio/min": 0.28692914694547655, "sampling/sampling_logp_difference/max": 1.36034255027771, "sampling/sampling_logp_difference/mean": 0.013059111684560776, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1521.6, "completions/max_terminated_length": 1521.6, "completions/mean_length": 1158.971875, "completions/mean_terminated_length": 1158.971875, "completions/min_length": 846.4, "completions/min_terminated_length": 846.4, "entropy": 0.2745051383972168, "epoch": 2.526439482961222, "frac_reward_zero_std": 0.5, "grad_norm": 0.663104772567749, "learning_rate": 2.5514901865762054e-07, "loss": 0.0025, "num_tokens": 289380430.0, "reward": 0.8361458539962768, "reward_std": 0.10924627855420113, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8361458539962768, "rewards/e2e_recall_precision_mixed_reward/std": 0.2811933159828186, "sampling/importance_sampling_ratio/max": 1.9657442808151244, "sampling/importance_sampling_ratio/mean": 0.9999931931495667, "sampling/importance_sampling_ratio/min": 0.39454739093780516, "sampling/sampling_logp_difference/max": 0.9744221210479737, "sampling/sampling_logp_difference/mean": 0.014131250046193599, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1660.0, "completions/max_terminated_length": 1660.0, "completions/mean_length": 1141.4625, "completions/mean_terminated_length": 1141.4625, "completions/min_length": 798.2, "completions/min_terminated_length": 798.2, "entropy": 0.2594828069210052, "epoch": 2.5323149236192712, "frac_reward_zero_std": 0.45, "grad_norm": 1.1496278047561646, "learning_rate": 2.54543251756724e-07, "loss": 0.0095, "num_tokens": 290082946.0, "reward": 0.86953125, "reward_std": 0.10211466997861862, "rewards/e2e_recall_precision_mixed_reward/mean": 0.869531261920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.22828298360109328, "sampling/importance_sampling_ratio/max": 1.914345669746399, "sampling/importance_sampling_ratio/mean": 0.9999447941780091, "sampling/importance_sampling_ratio/min": 0.33225939571857455, "sampling/sampling_logp_difference/max": 1.1551615476608277, "sampling/sampling_logp_difference/mean": 0.013497978821396828, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.8, "completions/max_terminated_length": 1804.8, "completions/mean_length": 1214.8125, "completions/mean_terminated_length": 1214.8125, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "entropy": 0.2724804818630219, "epoch": 2.538190364277321, "frac_reward_zero_std": 0.65, "grad_norm": 0.44894957542419434, "learning_rate": 2.5393748485582747e-07, "loss": 0.0016, "num_tokens": 290774182.0, "reward": 0.909375011920929, "reward_std": 0.06227758340537548, "rewards/e2e_recall_precision_mixed_reward/mean": 0.909375011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.18587272763252258, "sampling/importance_sampling_ratio/max": 1.8864954710006714, "sampling/importance_sampling_ratio/mean": 0.9999249219894409, "sampling/importance_sampling_ratio/min": 0.313013830780983, "sampling/sampling_logp_difference/max": 1.5233005046844483, "sampling/sampling_logp_difference/mean": 0.013677316159009934, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1856.6, "completions/max_terminated_length": 1856.6, "completions/mean_length": 1215.196875, "completions/mean_terminated_length": 1215.196875, "completions/min_length": 869.4, "completions/min_terminated_length": 869.4, "entropy": 0.26370081305503845, "epoch": 2.54406580493537, "frac_reward_zero_std": 0.6, "grad_norm": 0.583694577217102, "learning_rate": 2.533317179549309e-07, "loss": 0.0099, "num_tokens": 291523045.0, "reward": 0.7243229269981384, "reward_std": 0.07695610821247101, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7243229269981384, "rewards/e2e_recall_precision_mixed_reward/std": 0.2652903199195862, "sampling/importance_sampling_ratio/max": 1.9399638891220092, "sampling/importance_sampling_ratio/mean": 1.000074291229248, "sampling/importance_sampling_ratio/min": 0.25444764718413354, "sampling/sampling_logp_difference/max": 1.7197787404060363, "sampling/sampling_logp_difference/mean": 0.013512972928583621, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1562.8, "completions/max_terminated_length": 1562.8, "completions/mean_length": 1180.584375, "completions/mean_terminated_length": 1180.584375, "completions/min_length": 923.2, "completions/min_terminated_length": 923.2, "entropy": 0.25344176292419435, "epoch": 2.5499412455934194, "frac_reward_zero_std": 0.8, "grad_norm": 0.4013759195804596, "learning_rate": 2.527259510540344e-07, "loss": 0.0009, "num_tokens": 292235200.0, "reward": 0.818750011920929, "reward_std": 0.04898076355457306, "rewards/e2e_recall_precision_mixed_reward/mean": 0.818750011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.25387853682041167, "sampling/importance_sampling_ratio/max": 1.9529661893844605, "sampling/importance_sampling_ratio/mean": 1.0000003099441528, "sampling/importance_sampling_ratio/min": 0.333037468791008, "sampling/sampling_logp_difference/max": 1.4715055704116822, "sampling/sampling_logp_difference/mean": 0.013367529399693013, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1429.2, "completions/max_terminated_length": 1429.2, "completions/mean_length": 1136.7625, "completions/mean_terminated_length": 1136.7625, "completions/min_length": 874.8, "completions/min_terminated_length": 874.8, "entropy": 0.2680712938308716, "epoch": 2.555816686251469, "frac_reward_zero_std": 0.6, "grad_norm": 0.7385175228118896, "learning_rate": 2.5212018415313783e-07, "loss": 0.0054, "num_tokens": 292919940.0, "reward": 0.8784374952316284, "reward_std": 0.06511118579655886, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8784374952316284, "rewards/e2e_recall_precision_mixed_reward/std": 0.20323096066713334, "sampling/importance_sampling_ratio/max": 1.9584675550460815, "sampling/importance_sampling_ratio/mean": 1.0000271797180176, "sampling/importance_sampling_ratio/min": 0.32080017030239105, "sampling/sampling_logp_difference/max": 1.3096740007400514, "sampling/sampling_logp_difference/mean": 0.013850261084735393, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1661.6, "completions/max_terminated_length": 1661.6, "completions/mean_length": 1151.040625, "completions/mean_terminated_length": 1151.040625, "completions/min_length": 855.2, "completions/min_terminated_length": 855.2, "entropy": 0.25901271402835846, "epoch": 2.5616921269095183, "frac_reward_zero_std": 0.55, "grad_norm": 0.416144460439682, "learning_rate": 2.515144172522413e-07, "loss": -0.0003, "num_tokens": 293619409.0, "reward": 0.7942708492279053, "reward_std": 0.10647799670696259, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7942708492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.2655137896537781, "sampling/importance_sampling_ratio/max": 1.9906936645507813, "sampling/importance_sampling_ratio/mean": 0.9999905347824096, "sampling/importance_sampling_ratio/min": 0.3820026218891144, "sampling/sampling_logp_difference/max": 1.1053644180297852, "sampling/sampling_logp_difference/mean": 0.013533038832247258, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1738.6, "completions/max_terminated_length": 1738.6, "completions/mean_length": 1254.803125, "completions/mean_terminated_length": 1254.803125, "completions/min_length": 936.8, "completions/min_terminated_length": 936.8, "entropy": 0.2685577243566513, "epoch": 2.5675675675675675, "frac_reward_zero_std": 0.55, "grad_norm": 0.6036714315414429, "learning_rate": 2.509086503513448e-07, "loss": -0.0023, "num_tokens": 294351490.0, "reward": 0.871875, "reward_std": 0.10807717889547348, "rewards/e2e_recall_precision_mixed_reward/mean": 0.871875, "rewards/e2e_recall_precision_mixed_reward/std": 0.25218716263771057, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000145792961121, "sampling/importance_sampling_ratio/min": 0.334485599398613, "sampling/sampling_logp_difference/max": 1.3058324337005616, "sampling/sampling_logp_difference/mean": 0.013811002299189568, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1710.2, "completions/max_terminated_length": 1710.2, "completions/mean_length": 1185.6, "completions/mean_terminated_length": 1185.6, "completions/min_length": 926.2, "completions/min_terminated_length": 926.2, "entropy": 0.2519607961177826, "epoch": 2.573443008225617, "frac_reward_zero_std": 0.75, "grad_norm": 0.429861843585968, "learning_rate": 2.5030288345044824e-07, "loss": 0.0046, "num_tokens": 295068802.0, "reward": 0.6942708432674408, "reward_std": 0.0552545927464962, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6942708432674408, "rewards/e2e_recall_precision_mixed_reward/std": 0.28056047260761263, "sampling/importance_sampling_ratio/max": 1.9829387187957763, "sampling/importance_sampling_ratio/mean": 1.0000043034553527, "sampling/importance_sampling_ratio/min": 0.39192359447479247, "sampling/sampling_logp_difference/max": 0.969468641281128, "sampling/sampling_logp_difference/mean": 0.013157148286700249, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1570.6, "completions/max_terminated_length": 1570.6, "completions/mean_length": 1153.78125, "completions/mean_terminated_length": 1153.78125, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "entropy": 0.25543263256549836, "epoch": 2.579318448883666, "frac_reward_zero_std": 0.6, "grad_norm": 0.484397828578949, "learning_rate": 2.4969711654955173e-07, "loss": 0.0083, "num_tokens": 295767884.0, "reward": 0.7614583492279052, "reward_std": 0.10435431525111198, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7614583492279052, "rewards/e2e_recall_precision_mixed_reward/std": 0.2718092933297157, "sampling/importance_sampling_ratio/max": 1.9637941360473632, "sampling/importance_sampling_ratio/mean": 0.9999634385108948, "sampling/importance_sampling_ratio/min": 0.3485614687204361, "sampling/sampling_logp_difference/max": 1.1356598377227782, "sampling/sampling_logp_difference/mean": 0.013540227897465229, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1515.8, "completions/max_terminated_length": 1515.8, "completions/mean_length": 1139.3125, "completions/mean_terminated_length": 1139.3125, "completions/min_length": 892.4, "completions/min_terminated_length": 892.4, "entropy": 0.24383938014507295, "epoch": 2.5851938895417157, "frac_reward_zero_std": 0.25, "grad_norm": 0.8983497023582458, "learning_rate": 2.4909134964865517e-07, "loss": 0.0002, "num_tokens": 296434144.0, "reward": 0.7223958492279052, "reward_std": 0.12622348368167877, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7223958492279052, "rewards/e2e_recall_precision_mixed_reward/std": 0.29917227625846865, "sampling/importance_sampling_ratio/max": 1.972477102279663, "sampling/importance_sampling_ratio/mean": 0.9999940752983093, "sampling/importance_sampling_ratio/min": 0.3770484387874603, "sampling/sampling_logp_difference/max": 1.139905858039856, "sampling/sampling_logp_difference/mean": 0.01284611839801073, "step": 2200 }, { "epoch": 2.5851938895417157, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.000625, "eval_completions/max_length": 1606.24, "eval_completions/max_terminated_length": 1601.2, "eval_completions/mean_length": 1145.1075, "eval_completions/mean_terminated_length": 1144.29046875, "eval_completions/min_length": 848.8, "eval_completions/min_terminated_length": 848.8, "eval_entropy": 0.26203078508377076, "eval_frac_reward_zero_std": 0.62, "eval_loss": 0.0016268673352897167, "eval_num_tokens": 296434144.0, "eval_reward": 0.7521041774749756, "eval_reward_std": 0.0784831927716732, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7521041774749756, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.29837510973215103, "eval_runtime": 437.465, "eval_samples_per_second": 0.229, "eval_sampling/importance_sampling_ratio/max": 1.9403350448608399, "eval_sampling/importance_sampling_ratio/mean": 1.000009639263153, "eval_sampling/importance_sampling_ratio/min": 0.3114258821308613, "eval_sampling/sampling_logp_difference/max": 1.4294465684890747, "eval_sampling/sampling_logp_difference/mean": 0.013600032962858676, "eval_steps_per_second": 0.005, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1612.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 1124.58125, "completions/mean_terminated_length": 1124.58125, "completions/min_length": 762.4, "completions/min_terminated_length": 762.4, "entropy": 0.2596117079257965, "epoch": 2.591069330199765, "frac_reward_zero_std": 0.6, "grad_norm": 0.47432422637939453, "learning_rate": 2.4848558274775866e-07, "loss": 0.0061, "num_tokens": 297112778.0, "reward": 0.8008854269981385, "reward_std": 0.06376661993563175, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8008854269981385, "rewards/e2e_recall_precision_mixed_reward/std": 0.33332130759954454, "sampling/importance_sampling_ratio/max": 1.938689661026001, "sampling/importance_sampling_ratio/mean": 0.9999597907066345, "sampling/importance_sampling_ratio/min": 0.3991494715213776, "sampling/sampling_logp_difference/max": 1.0992213487625122, "sampling/sampling_logp_difference/mean": 0.013456992991268634, "step": 2205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1443.0, "completions/max_terminated_length": 1443.0, "completions/mean_length": 1105.88125, "completions/mean_terminated_length": 1105.88125, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "entropy": 0.24657217562198638, "epoch": 2.5969447708578146, "frac_reward_zero_std": 0.55, "grad_norm": 0.0, "learning_rate": 2.4787981584686215e-07, "loss": -0.004, "num_tokens": 297772052.0, "reward": 0.8242187619209289, "reward_std": 0.10624904036521912, "rewards/e2e_recall_precision_mixed_reward/mean": 0.82421875, "rewards/e2e_recall_precision_mixed_reward/std": 0.20129311084747314, "sampling/importance_sampling_ratio/max": 1.959466028213501, "sampling/importance_sampling_ratio/mean": 1.0000609636306763, "sampling/importance_sampling_ratio/min": 0.339236655831337, "sampling/sampling_logp_difference/max": 1.203588342666626, "sampling/sampling_logp_difference/mean": 0.013020346313714981, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1395.2, "completions/max_terminated_length": 1395.2, "completions/mean_length": 1025.321875, "completions/mean_terminated_length": 1025.321875, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "entropy": 0.2446480482816696, "epoch": 2.602820211515864, "frac_reward_zero_std": 0.6, "grad_norm": 0.6485159993171692, "learning_rate": 2.472740489459656e-07, "loss": 0.0036, "num_tokens": 298428219.0, "reward": 0.7654687643051148, "reward_std": 0.07041542753577232, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7654687643051148, "rewards/e2e_recall_precision_mixed_reward/std": 0.31836703717708587, "sampling/importance_sampling_ratio/max": 1.929987335205078, "sampling/importance_sampling_ratio/mean": 0.9999933481216431, "sampling/importance_sampling_ratio/min": 0.37912967801094055, "sampling/sampling_logp_difference/max": 0.9853403449058533, "sampling/sampling_logp_difference/mean": 0.012882444821298123, "step": 2215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1636.4, "completions/max_terminated_length": 1636.4, "completions/mean_length": 1112.94375, "completions/mean_terminated_length": 1112.94375, "completions/min_length": 722.4, "completions/min_terminated_length": 722.4, "entropy": 0.24474719762802125, "epoch": 2.608695652173913, "frac_reward_zero_std": 0.45, "grad_norm": 0.7990810871124268, "learning_rate": 2.46668282045069e-07, "loss": -0.0006, "num_tokens": 299120297.0, "reward": 0.7864583611488343, "reward_std": 0.10424329489469528, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7864583611488343, "rewards/e2e_recall_precision_mixed_reward/std": 0.305034938454628, "sampling/importance_sampling_ratio/max": 1.9024056434631347, "sampling/importance_sampling_ratio/mean": 0.9999809026718139, "sampling/importance_sampling_ratio/min": 0.2755643067397159, "sampling/sampling_logp_difference/max": 3.7205732345581053, "sampling/sampling_logp_difference/mean": 0.013068239949643613, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1734.8, "completions/max_terminated_length": 1708.6, "completions/mean_length": 1183.4, "completions/mean_terminated_length": 1179.7350830078126, "completions/min_length": 819.6, "completions/min_terminated_length": 819.6, "entropy": 0.259357213973999, "epoch": 2.6145710928319623, "frac_reward_zero_std": 0.4, "grad_norm": 0.7435207366943359, "learning_rate": 2.460625151441725e-07, "loss": -0.0046, "num_tokens": 299801477.0, "reward": 0.7505208492279053, "reward_std": 0.13032824955880642, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7505208492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.29717023074626925, "sampling/importance_sampling_ratio/max": 1.993881344795227, "sampling/importance_sampling_ratio/mean": 1.000022292137146, "sampling/importance_sampling_ratio/min": 0.3933206915855408, "sampling/sampling_logp_difference/max": 1.1224096775054933, "sampling/sampling_logp_difference/mean": 0.013361809588968755, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1441.4, "completions/max_terminated_length": 1441.4, "completions/mean_length": 1047.221875, "completions/mean_terminated_length": 1047.221875, "completions/min_length": 758.6, "completions/min_terminated_length": 758.6, "entropy": 0.2406633496284485, "epoch": 2.6204465334900116, "frac_reward_zero_std": 0.65, "grad_norm": 0.41060614585876465, "learning_rate": 2.45456748243276e-07, "loss": 0.0021, "num_tokens": 300458348.0, "reward": 0.8791666746139526, "reward_std": 0.06388028524816036, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8791666746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.22914170026779174, "sampling/importance_sampling_ratio/max": 1.9028963804244996, "sampling/importance_sampling_ratio/mean": 0.9999442934989929, "sampling/importance_sampling_ratio/min": 0.3476558208465576, "sampling/sampling_logp_difference/max": 1.069634747505188, "sampling/sampling_logp_difference/mean": 0.01292272675782442, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1604.2, "completions/max_terminated_length": 1604.2, "completions/mean_length": 1084.35, "completions/mean_terminated_length": 1084.35, "completions/min_length": 794.8, "completions/min_terminated_length": 794.8, "entropy": 0.24303655624389647, "epoch": 2.6263219741480612, "frac_reward_zero_std": 0.3, "grad_norm": 0.7626137137413025, "learning_rate": 2.4485098134237944e-07, "loss": 0.0031, "num_tokens": 301145692.0, "reward": 0.775000023841858, "reward_std": 0.14809788316488265, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7750000178813934, "rewards/e2e_recall_precision_mixed_reward/std": 0.32100625038146974, "sampling/importance_sampling_ratio/max": 1.8840370416641234, "sampling/importance_sampling_ratio/mean": 1.0000063896179199, "sampling/importance_sampling_ratio/min": 0.31093878746032716, "sampling/sampling_logp_difference/max": 1.2778079032897949, "sampling/sampling_logp_difference/mean": 0.013110420294106006, "step": 2235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1503.6, "completions/max_terminated_length": 1503.6, "completions/mean_length": 1122.2, "completions/mean_terminated_length": 1122.2, "completions/min_length": 835.0, "completions/min_terminated_length": 835.0, "entropy": 0.2503249257802963, "epoch": 2.6321974148061105, "frac_reward_zero_std": 0.55, "grad_norm": 0.7380458116531372, "learning_rate": 2.442452144414829e-07, "loss": 0.0018, "num_tokens": 301798796.0, "reward": 0.8723958492279053, "reward_std": 0.08415319249033928, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8723958492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.2061576321721077, "sampling/importance_sampling_ratio/max": 1.8270782232284546, "sampling/importance_sampling_ratio/mean": 0.9999822735786438, "sampling/importance_sampling_ratio/min": 0.4466776907444, "sampling/sampling_logp_difference/max": 0.9171277284622192, "sampling/sampling_logp_difference/mean": 0.013088957034051418, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1467.0, "completions/max_terminated_length": 1467.0, "completions/mean_length": 1079.2375, "completions/mean_terminated_length": 1079.2375, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "entropy": 0.24606316089630126, "epoch": 2.6380728554641597, "frac_reward_zero_std": 0.55, "grad_norm": 0.49479740858078003, "learning_rate": 2.4363944754058636e-07, "loss": 0.0042, "num_tokens": 302497080.0, "reward": 0.9192708492279053, "reward_std": 0.09399299174547196, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9192708492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.16631564050912856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001073718070983, "sampling/importance_sampling_ratio/min": 0.30234395563602445, "sampling/sampling_logp_difference/max": 1.2849443435668946, "sampling/sampling_logp_difference/mean": 0.01323620304465294, "step": 2245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1562.0, "completions/max_terminated_length": 1562.0, "completions/mean_length": 1161.11875, "completions/mean_terminated_length": 1161.11875, "completions/min_length": 900.0, "completions/min_terminated_length": 900.0, "entropy": 0.2619780212640762, "epoch": 2.6439482961222094, "frac_reward_zero_std": 0.75, "grad_norm": 0.48926234245300293, "learning_rate": 2.4303368063968985e-07, "loss": -0.0007, "num_tokens": 303196526.0, "reward": 0.7757812738418579, "reward_std": 0.046535524725914004, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7757812738418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.36423116475343703, "sampling/importance_sampling_ratio/max": 1.9756156206130981, "sampling/importance_sampling_ratio/mean": 1.000075590610504, "sampling/importance_sampling_ratio/min": 0.34909204840660096, "sampling/sampling_logp_difference/max": 1.066146230697632, "sampling/sampling_logp_difference/mean": 0.013476391322910786, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1581.4, "completions/max_terminated_length": 1581.4, "completions/mean_length": 1124.425, "completions/mean_terminated_length": 1124.425, "completions/min_length": 843.2, "completions/min_terminated_length": 843.2, "entropy": 0.24228012859821318, "epoch": 2.6498237367802586, "frac_reward_zero_std": 0.75, "grad_norm": 0.0, "learning_rate": 2.424279137387933e-07, "loss": 0.0052, "num_tokens": 303868134.0, "reward": 0.9687500119209289, "reward_std": 0.04924879372119904, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9687500119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.09995669350028039, "sampling/importance_sampling_ratio/max": 1.896589422225952, "sampling/importance_sampling_ratio/mean": 1.0000037789344787, "sampling/importance_sampling_ratio/min": 0.38552327156066896, "sampling/sampling_logp_difference/max": 1.0073142290115356, "sampling/sampling_logp_difference/mean": 0.01276505459100008, "step": 2255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1599.6, "completions/max_terminated_length": 1599.6, "completions/mean_length": 1126.265625, "completions/mean_terminated_length": 1126.265625, "completions/min_length": 868.2, "completions/min_terminated_length": 868.2, "entropy": 0.2533190757036209, "epoch": 2.655699177438308, "frac_reward_zero_std": 0.3, "grad_norm": 0.824459433555603, "learning_rate": 2.418221468378968e-07, "loss": 0.0002, "num_tokens": 304553291.0, "reward": 0.8223958611488342, "reward_std": 0.12836966216564177, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8223958611488342, "rewards/e2e_recall_precision_mixed_reward/std": 0.2609886109828949, "sampling/importance_sampling_ratio/max": 1.8384875535964966, "sampling/importance_sampling_ratio/mean": 0.9999510169029235, "sampling/importance_sampling_ratio/min": 0.4050338566303253, "sampling/sampling_logp_difference/max": 0.9330639719963074, "sampling/sampling_logp_difference/mean": 0.01324385330080986, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1628.8, "completions/max_terminated_length": 1628.8, "completions/mean_length": 1131.925, "completions/mean_terminated_length": 1131.925, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "entropy": 0.25666095316410065, "epoch": 2.661574618096357, "frac_reward_zero_std": 0.4, "grad_norm": 0.6386078000068665, "learning_rate": 2.412163799370002e-07, "loss": 0.0038, "num_tokens": 305229475.0, "reward": 0.8148958563804627, "reward_std": 0.11356194913387299, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8148958563804627, "rewards/e2e_recall_precision_mixed_reward/std": 0.25244076550006866, "sampling/importance_sampling_ratio/max": 1.9641210556030273, "sampling/importance_sampling_ratio/mean": 0.999966835975647, "sampling/importance_sampling_ratio/min": 0.3801779314875603, "sampling/sampling_logp_difference/max": 1.1193055629730224, "sampling/sampling_logp_difference/mean": 0.01317644640803337, "step": 2265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1700.0, "completions/max_terminated_length": 1700.0, "completions/mean_length": 1193.60625, "completions/mean_terminated_length": 1193.60625, "completions/min_length": 856.2, "completions/min_terminated_length": 856.2, "entropy": 0.2445121705532074, "epoch": 2.6674500587544063, "frac_reward_zero_std": 0.45, "grad_norm": 0.40095868706703186, "learning_rate": 2.406106130361037e-07, "loss": -0.0086, "num_tokens": 305924837.0, "reward": 0.8000520944595337, "reward_std": 0.11730363368988037, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8000520944595337, "rewards/e2e_recall_precision_mixed_reward/std": 0.28921135514974594, "sampling/importance_sampling_ratio/max": 1.9419026374816895, "sampling/importance_sampling_ratio/mean": 0.9999643087387085, "sampling/importance_sampling_ratio/min": 0.32038319408893584, "sampling/sampling_logp_difference/max": 1.219898271560669, "sampling/sampling_logp_difference/mean": 0.0127852413803339, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1673.2, "completions/max_terminated_length": 1673.2, "completions/mean_length": 1131.45, "completions/mean_terminated_length": 1131.45, "completions/min_length": 824.4, "completions/min_terminated_length": 824.4, "entropy": 0.24599368572235109, "epoch": 2.673325499412456, "frac_reward_zero_std": 0.6, "grad_norm": 0.48403316736221313, "learning_rate": 2.4000484613520714e-07, "loss": 0.0089, "num_tokens": 306580261.0, "reward": 0.7893229305744172, "reward_std": 0.07952911332249642, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7893229305744172, "rewards/e2e_recall_precision_mixed_reward/std": 0.27056502997875215, "sampling/importance_sampling_ratio/max": 1.9567157745361328, "sampling/importance_sampling_ratio/mean": 0.9999315857887268, "sampling/importance_sampling_ratio/min": 0.3672967258840799, "sampling/sampling_logp_difference/max": 1.4008118629455566, "sampling/sampling_logp_difference/mean": 0.012783203460276126, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1547.6, "completions/max_terminated_length": 1547.6, "completions/mean_length": 1147.421875, "completions/mean_terminated_length": 1147.421875, "completions/min_length": 876.6, "completions/min_terminated_length": 876.6, "entropy": 0.2490744948387146, "epoch": 2.6792009400705052, "frac_reward_zero_std": 0.7, "grad_norm": 0.7384084463119507, "learning_rate": 2.3939907923431063e-07, "loss": -0.0021, "num_tokens": 307250684.0, "reward": 0.8109375, "reward_std": 0.0812767967581749, "rewards/e2e_recall_precision_mixed_reward/mean": 0.810937511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.27321686446666715, "sampling/importance_sampling_ratio/max": 1.9557661771774293, "sampling/importance_sampling_ratio/mean": 0.9999991416931152, "sampling/importance_sampling_ratio/min": 0.3092913806438446, "sampling/sampling_logp_difference/max": 1.2244453430175781, "sampling/sampling_logp_difference/mean": 0.012955594807863235, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1552.8, "completions/max_terminated_length": 1552.8, "completions/mean_length": 1139.478125, "completions/mean_terminated_length": 1139.478125, "completions/min_length": 891.4, "completions/min_terminated_length": 891.4, "entropy": 0.24866257309913636, "epoch": 2.6850763807285545, "frac_reward_zero_std": 0.45, "grad_norm": 0.5620933175086975, "learning_rate": 2.387933123334141e-07, "loss": 0.01, "num_tokens": 307932517.0, "reward": 0.8145833492279053, "reward_std": 0.10678637623786927, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8145833492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.2612167507410049, "sampling/importance_sampling_ratio/max": 1.8827569007873535, "sampling/importance_sampling_ratio/mean": 1.0000137686729431, "sampling/importance_sampling_ratio/min": 0.3459669291973114, "sampling/sampling_logp_difference/max": 1.0946119785308839, "sampling/sampling_logp_difference/mean": 0.013137634284794331, "step": 2285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1475.8, "completions/max_terminated_length": 1475.8, "completions/mean_length": 1094.653125, "completions/mean_terminated_length": 1094.653125, "completions/min_length": 844.4, "completions/min_terminated_length": 844.4, "entropy": 0.24577432572841645, "epoch": 2.690951821386604, "frac_reward_zero_std": 0.55, "grad_norm": 0.7518845796585083, "learning_rate": 2.3818754543251755e-07, "loss": -0.0043, "num_tokens": 308583686.0, "reward": 0.8395833492279052, "reward_std": 0.09295275211334228, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8395833492279052, "rewards/e2e_recall_precision_mixed_reward/std": 0.2516939163208008, "sampling/importance_sampling_ratio/max": 1.9372399568557739, "sampling/importance_sampling_ratio/mean": 0.9999632596969604, "sampling/importance_sampling_ratio/min": 0.3853158295154572, "sampling/sampling_logp_difference/max": 1.0470689058303833, "sampling/sampling_logp_difference/mean": 0.012928933463990688, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1571.6, "completions/max_terminated_length": 1571.6, "completions/mean_length": 1107.1375, "completions/mean_terminated_length": 1107.1375, "completions/min_length": 784.2, "completions/min_terminated_length": 784.2, "entropy": 0.237555655837059, "epoch": 2.6968272620446534, "frac_reward_zero_std": 0.3, "grad_norm": 0.7750040292739868, "learning_rate": 2.3758177853162102e-07, "loss": -0.0012, "num_tokens": 309254690.0, "reward": 0.801927101612091, "reward_std": 0.1355880841612816, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8019270896911621, "rewards/e2e_recall_precision_mixed_reward/std": 0.2576450608670712, "sampling/importance_sampling_ratio/max": 1.963302206993103, "sampling/importance_sampling_ratio/mean": 1.0001328825950622, "sampling/importance_sampling_ratio/min": 0.3588021665811539, "sampling/sampling_logp_difference/max": 1.0835482478141785, "sampling/sampling_logp_difference/mean": 0.012560645118355751, "step": 2295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1334.6, "completions/max_terminated_length": 1334.6, "completions/mean_length": 1019.06875, "completions/mean_terminated_length": 1019.06875, "completions/min_length": 742.4, "completions/min_terminated_length": 742.4, "entropy": 0.2461823046207428, "epoch": 2.7027027027027026, "frac_reward_zero_std": 0.55, "grad_norm": 0.4063376486301422, "learning_rate": 2.3697601163072448e-07, "loss": 0.0081, "num_tokens": 309941384.0, "reward": 0.7347396016120911, "reward_std": 0.10757745876908302, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7347396016120911, "rewards/e2e_recall_precision_mixed_reward/std": 0.3722479432821274, "sampling/importance_sampling_ratio/max": 1.9909217596054076, "sampling/importance_sampling_ratio/mean": 0.9999774694442749, "sampling/importance_sampling_ratio/min": 0.40027025938034055, "sampling/sampling_logp_difference/max": 1.0413033485412597, "sampling/sampling_logp_difference/mean": 0.013325695879757404, "step": 2300 }, { "epoch": 2.7027027027027026, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1500.96, "eval_completions/max_terminated_length": 1500.96, "eval_completions/mean_length": 1071.851875, "eval_completions/mean_terminated_length": 1071.851875, "eval_completions/min_length": 804.0, "eval_completions/min_terminated_length": 804.0, "eval_entropy": 0.24838149964809417, "eval_frac_reward_zero_std": 0.57, "eval_loss": 0.0029058277141302824, "eval_num_tokens": 309941384.0, "eval_reward": 0.7540104258060455, "eval_reward_std": 0.08435056537389755, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7540104258060455, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3009152537584305, "eval_runtime": 409.9966, "eval_samples_per_second": 0.244, "eval_sampling/importance_sampling_ratio/max": 1.9231956005096436, "eval_sampling/importance_sampling_ratio/mean": 1.0000133728981018, "eval_sampling/importance_sampling_ratio/min": 0.3962679693102837, "eval_sampling/sampling_logp_difference/max": 1.0625820803642272, "eval_sampling/sampling_logp_difference/mean": 0.013145512826740742, "eval_steps_per_second": 0.005, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1685.8, "completions/max_terminated_length": 1685.8, "completions/mean_length": 1141.159375, "completions/mean_terminated_length": 1141.159375, "completions/min_length": 790.2, "completions/min_terminated_length": 790.2, "entropy": 0.24258246421813964, "epoch": 2.708578143360752, "frac_reward_zero_std": 0.45, "grad_norm": 0.5708547234535217, "learning_rate": 2.3637024472982794e-07, "loss": 0.01, "num_tokens": 310627675.0, "reward": 0.7439583539962769, "reward_std": 0.08287105187773705, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7439583539962769, "rewards/e2e_recall_precision_mixed_reward/std": 0.3338875025510788, "sampling/importance_sampling_ratio/max": 1.9743703126907348, "sampling/importance_sampling_ratio/mean": 1.0000213861465455, "sampling/importance_sampling_ratio/min": 0.3426578164100647, "sampling/sampling_logp_difference/max": 1.1372323036193848, "sampling/sampling_logp_difference/mean": 0.012876101024448871, "step": 2305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1322.8, "completions/max_terminated_length": 1322.8, "completions/mean_length": 1040.896875, "completions/mean_terminated_length": 1040.896875, "completions/min_length": 783.8, "completions/min_terminated_length": 783.8, "entropy": 0.23975794315338134, "epoch": 2.7144535840188015, "frac_reward_zero_std": 0.5, "grad_norm": 0.45213189721107483, "learning_rate": 2.357644778289314e-07, "loss": 0.0011, "num_tokens": 311274810.0, "reward": 0.7901041746139527, "reward_std": 0.09313196986913681, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7901041865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.30386063158512117, "sampling/importance_sampling_ratio/max": 1.8083451986312866, "sampling/importance_sampling_ratio/mean": 0.9999869465827942, "sampling/importance_sampling_ratio/min": 0.30560941696166993, "sampling/sampling_logp_difference/max": 1.2987362384796142, "sampling/sampling_logp_difference/mean": 0.012629561126232147, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1463.6, "completions/max_terminated_length": 1463.6, "completions/mean_length": 1084.85625, "completions/mean_terminated_length": 1084.85625, "completions/min_length": 815.6, "completions/min_terminated_length": 815.6, "entropy": 0.23631844222545623, "epoch": 2.720329024676851, "frac_reward_zero_std": 0.55, "grad_norm": 0.7602788209915161, "learning_rate": 2.3515871092803487e-07, "loss": 0.0058, "num_tokens": 311959612.0, "reward": 0.9205729246139527, "reward_std": 0.07825153470039367, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9205729246139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.1699974089860916, "sampling/importance_sampling_ratio/max": 1.9587501287460327, "sampling/importance_sampling_ratio/mean": 0.9999679207801819, "sampling/importance_sampling_ratio/min": 0.47454640865325926, "sampling/sampling_logp_difference/max": 0.770689058303833, "sampling/sampling_logp_difference/mean": 0.012636097148060799, "step": 2315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1321.8, "completions/max_terminated_length": 1321.8, "completions/mean_length": 1015.790625, "completions/mean_terminated_length": 1015.790625, "completions/min_length": 776.4, "completions/min_terminated_length": 776.4, "entropy": 0.225640469789505, "epoch": 2.7262044653349, "frac_reward_zero_std": 0.75, "grad_norm": 0.0, "learning_rate": 2.3455294402713836e-07, "loss": 0.0027, "num_tokens": 312586553.0, "reward": 0.8807291865348816, "reward_std": 0.040735363215208056, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8807291865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.21484533101320266, "sampling/importance_sampling_ratio/max": 1.9278332471847535, "sampling/importance_sampling_ratio/mean": 0.9999934673309326, "sampling/importance_sampling_ratio/min": 0.41382956355810163, "sampling/sampling_logp_difference/max": 1.2142931938171386, "sampling/sampling_logp_difference/mean": 0.012177078425884247, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1457.6, "completions/max_terminated_length": 1457.6, "completions/mean_length": 1075.2, "completions/mean_terminated_length": 1075.2, "completions/min_length": 785.0, "completions/min_terminated_length": 785.0, "entropy": 0.21975458264350892, "epoch": 2.7320799059929497, "frac_reward_zero_std": 0.6, "grad_norm": 0.6667649745941162, "learning_rate": 2.3394717712624182e-07, "loss": 0.0042, "num_tokens": 313238793.0, "reward": 0.8770833373069763, "reward_std": 0.06933515965938568, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8770833492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.20621103495359422, "sampling/importance_sampling_ratio/max": 1.941676425933838, "sampling/importance_sampling_ratio/mean": 0.9999809026718139, "sampling/importance_sampling_ratio/min": 0.3605471342802048, "sampling/sampling_logp_difference/max": 1.1387160539627075, "sampling/sampling_logp_difference/mean": 0.011648139916360378, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1666.6, "completions/max_terminated_length": 1666.6, "completions/mean_length": 1107.15625, "completions/mean_terminated_length": 1107.15625, "completions/min_length": 758.4, "completions/min_terminated_length": 758.4, "entropy": 0.22798166275024415, "epoch": 2.737955346650999, "frac_reward_zero_std": 0.45, "grad_norm": 0.6299868822097778, "learning_rate": 2.3334141022534528e-07, "loss": 0.0092, "num_tokens": 313900027.0, "reward": 0.8026041865348816, "reward_std": 0.1279518723487854, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8026041865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.282573002576828, "sampling/importance_sampling_ratio/max": 1.9123780488967896, "sampling/importance_sampling_ratio/mean": 0.9999549150466919, "sampling/importance_sampling_ratio/min": 0.32881303429603576, "sampling/sampling_logp_difference/max": 1.1452240943908691, "sampling/sampling_logp_difference/mean": 0.012409896217286586, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1431.0, "completions/max_terminated_length": 1431.0, "completions/mean_length": 1078.26875, "completions/mean_terminated_length": 1078.26875, "completions/min_length": 853.6, "completions/min_terminated_length": 853.6, "entropy": 0.2364683359861374, "epoch": 2.743830787309048, "frac_reward_zero_std": 0.65, "grad_norm": 0.4049372673034668, "learning_rate": 2.3273564332444875e-07, "loss": -0.0026, "num_tokens": 314558865.0, "reward": 0.9072916746139527, "reward_std": 0.06933557838201523, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9072916746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.20599967539310454, "sampling/importance_sampling_ratio/max": 1.9302043199539185, "sampling/importance_sampling_ratio/mean": 0.9999824285507202, "sampling/importance_sampling_ratio/min": 0.26731246411800386, "sampling/sampling_logp_difference/max": 1.3574584007263184, "sampling/sampling_logp_difference/mean": 0.012622298114001751, "step": 2335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375, "completions/max_length": 1699.6, "completions/max_terminated_length": 1661.0, "completions/mean_length": 1115.8625, "completions/mean_terminated_length": 1102.2861083984376, "completions/min_length": 790.0, "completions/min_terminated_length": 790.0, "entropy": 0.23917319178581237, "epoch": 2.7497062279670974, "frac_reward_zero_std": 0.5, "grad_norm": 0.659565269947052, "learning_rate": 2.3212987642355218e-07, "loss": -0.0083, "num_tokens": 315235353.0, "reward": 0.7572916686534882, "reward_std": 0.1146910235285759, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7572916686534882, "rewards/e2e_recall_precision_mixed_reward/std": 0.27366127967834475, "sampling/importance_sampling_ratio/max": 1.9613188266754151, "sampling/importance_sampling_ratio/mean": 1.000109815597534, "sampling/importance_sampling_ratio/min": 0.40567088723182676, "sampling/sampling_logp_difference/max": 0.9738764047622681, "sampling/sampling_logp_difference/mean": 0.012759437412023544, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1602.6, "completions/max_terminated_length": 1490.0, "completions/mean_length": 1066.584375, "completions/mean_terminated_length": 1062.37412109375, "completions/min_length": 722.6, "completions/min_terminated_length": 722.6, "entropy": 0.23375988602638245, "epoch": 2.7555816686251466, "frac_reward_zero_std": 0.5, "grad_norm": 0.0, "learning_rate": 2.3152410952265567e-07, "loss": 0.007, "num_tokens": 315908512.0, "reward": 0.8244791746139526, "reward_std": 0.10080392360687256, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8244791746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.2682786226272583, "sampling/importance_sampling_ratio/max": 1.8856647253036498, "sampling/importance_sampling_ratio/mean": 1.0000274181365967, "sampling/importance_sampling_ratio/min": 0.3129617631435394, "sampling/sampling_logp_difference/max": 1.3798688173294067, "sampling/sampling_logp_difference/mean": 0.012719161063432693, "step": 2345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1614.8, "completions/max_terminated_length": 1614.8, "completions/mean_length": 1075.425, "completions/mean_terminated_length": 1075.425, "completions/min_length": 761.6, "completions/min_terminated_length": 761.6, "entropy": 0.24059076011180877, "epoch": 2.7614571092831963, "frac_reward_zero_std": 0.55, "grad_norm": 0.6796007752418518, "learning_rate": 2.3091834262175914e-07, "loss": 0.0106, "num_tokens": 316558984.0, "reward": 0.7855208516120911, "reward_std": 0.08973032981157303, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7855208516120911, "rewards/e2e_recall_precision_mixed_reward/std": 0.28483576476573946, "sampling/importance_sampling_ratio/max": 1.9070019960403441, "sampling/importance_sampling_ratio/mean": 1.0000741243362428, "sampling/importance_sampling_ratio/min": 0.23457692796364427, "sampling/sampling_logp_difference/max": 2.6379079103469847, "sampling/sampling_logp_difference/mean": 0.013021462410688401, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1548.4, "completions/max_terminated_length": 1519.2, "completions/mean_length": 1059.915625, "completions/mean_terminated_length": 1039.7633544921875, "completions/min_length": 782.6, "completions/min_terminated_length": 782.6, "entropy": 0.22516947686672212, "epoch": 2.7673325499412456, "frac_reward_zero_std": 0.65, "grad_norm": 0.4491136074066162, "learning_rate": 2.303125757208626e-07, "loss": -0.0128, "num_tokens": 317209929.0, "reward": 0.8020833373069763, "reward_std": 0.06487823724746704, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8020833373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.3081536591053009, "sampling/importance_sampling_ratio/max": 1.9772464275360107, "sampling/importance_sampling_ratio/mean": 1.000032651424408, "sampling/importance_sampling_ratio/min": 0.38993417248129847, "sampling/sampling_logp_difference/max": 1.3113280177116393, "sampling/sampling_logp_difference/mean": 0.012163999117910862, "step": 2355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1625.2, "completions/max_terminated_length": 1625.2, "completions/mean_length": 1096.434375, "completions/mean_terminated_length": 1096.434375, "completions/min_length": 822.6, "completions/min_terminated_length": 822.6, "entropy": 0.23812492191791534, "epoch": 2.773207990599295, "frac_reward_zero_std": 0.5, "grad_norm": 0.4981617033481598, "learning_rate": 2.2970680881996606e-07, "loss": 0.0126, "num_tokens": 317890724.0, "reward": 0.8398437619209289, "reward_std": 0.10197720378637314, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8398437619209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.2605766087770462, "sampling/importance_sampling_ratio/max": 1.9663613796234132, "sampling/importance_sampling_ratio/mean": 0.9999598026275635, "sampling/importance_sampling_ratio/min": 0.3322810932993889, "sampling/sampling_logp_difference/max": 1.3144399881362916, "sampling/sampling_logp_difference/mean": 0.012650839053094387, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1339.6, "completions/max_terminated_length": 1339.6, "completions/mean_length": 995.4, "completions/mean_terminated_length": 995.4, "completions/min_length": 715.8, "completions/min_terminated_length": 715.8, "entropy": 0.21778804659843445, "epoch": 2.7790834312573445, "frac_reward_zero_std": 0.7, "grad_norm": 0.8730181455612183, "learning_rate": 2.2910104191906955e-07, "loss": -0.0016, "num_tokens": 318524436.0, "reward": 0.9052083492279053, "reward_std": 0.06153279021382332, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9052083492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.18928753733634948, "sampling/importance_sampling_ratio/max": 1.8762676239013671, "sampling/importance_sampling_ratio/mean": 0.9999732494354248, "sampling/importance_sampling_ratio/min": 0.3732657790184021, "sampling/sampling_logp_difference/max": 1.1048237800598144, "sampling/sampling_logp_difference/mean": 0.011777786910533905, "step": 2365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1410.8, "completions/max_terminated_length": 1410.8, "completions/mean_length": 991.55, "completions/mean_terminated_length": 991.55, "completions/min_length": 743.2, "completions/min_terminated_length": 743.2, "entropy": 0.2386650711297989, "epoch": 2.7849588719153937, "frac_reward_zero_std": 0.5, "grad_norm": 0.5324167013168335, "learning_rate": 2.28495275018173e-07, "loss": -0.0004, "num_tokens": 319173860.0, "reward": 0.8041666746139526, "reward_std": 0.08360731303691864, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8041666746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.23412725180387498, "sampling/importance_sampling_ratio/max": 1.9656662702560426, "sampling/importance_sampling_ratio/mean": 0.9999577879905701, "sampling/importance_sampling_ratio/min": 0.4060625612735748, "sampling/sampling_logp_difference/max": 1.150398063659668, "sampling/sampling_logp_difference/mean": 0.012849260680377483, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1420.6, "completions/max_terminated_length": 1420.6, "completions/mean_length": 1008.525, "completions/mean_terminated_length": 1008.525, "completions/min_length": 721.2, "completions/min_terminated_length": 721.2, "entropy": 0.21963294446468354, "epoch": 2.790834312573443, "frac_reward_zero_std": 0.6, "grad_norm": 0.6863450407981873, "learning_rate": 2.2788950811727648e-07, "loss": 0.0011, "num_tokens": 319832220.0, "reward": 0.7083333492279053, "reward_std": 0.0958444319665432, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7083333492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.34578675627708433, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998778700828552, "sampling/importance_sampling_ratio/min": 0.2893254727125168, "sampling/sampling_logp_difference/max": 1.3027319669723512, "sampling/sampling_logp_difference/mean": 0.012292330339550971, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1497.6, "completions/max_terminated_length": 1497.6, "completions/mean_length": 1102.7625, "completions/mean_terminated_length": 1102.7625, "completions/min_length": 832.6, "completions/min_terminated_length": 832.6, "entropy": 0.2415948212146759, "epoch": 2.796709753231492, "frac_reward_zero_std": 0.45, "grad_norm": 1.0058459043502808, "learning_rate": 2.272837412163799e-07, "loss": 0.0055, "num_tokens": 320522224.0, "reward": 0.8317708373069763, "reward_std": 0.1039445236325264, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8317708373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.2156577706336975, "sampling/importance_sampling_ratio/max": 1.988040065765381, "sampling/importance_sampling_ratio/mean": 0.9999471068382263, "sampling/importance_sampling_ratio/min": 0.3297655165195465, "sampling/sampling_logp_difference/max": 1.2059980869293212, "sampling/sampling_logp_difference/mean": 0.012710276432335377, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1576.6, "completions/max_terminated_length": 1576.6, "completions/mean_length": 1079.4, "completions/mean_terminated_length": 1079.4, "completions/min_length": 729.8, "completions/min_terminated_length": 729.8, "entropy": 0.24798661470413208, "epoch": 2.802585193889542, "frac_reward_zero_std": 0.55, "grad_norm": 0.6440450549125671, "learning_rate": 2.2667797431548338e-07, "loss": 0.0003, "num_tokens": 321186496.0, "reward": 0.7979166746139527, "reward_std": 0.073116684705019, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7979166865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.2511927545070648, "sampling/importance_sampling_ratio/max": 1.9184043169021607, "sampling/importance_sampling_ratio/mean": 0.9999388098716736, "sampling/importance_sampling_ratio/min": 0.3903336763381958, "sampling/sampling_logp_difference/max": 0.9640900135040283, "sampling/sampling_logp_difference/mean": 0.012855613417923451, "step": 2385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1510.8, "completions/max_terminated_length": 1510.8, "completions/mean_length": 1024.43125, "completions/mean_terminated_length": 1024.43125, "completions/min_length": 795.2, "completions/min_terminated_length": 795.2, "entropy": 0.24828683137893676, "epoch": 2.808460634547591, "frac_reward_zero_std": 0.55, "grad_norm": 0.4777759909629822, "learning_rate": 2.2607220741458686e-07, "loss": 0.0043, "num_tokens": 321826602.0, "reward": 0.8531250357627869, "reward_std": 0.09283200576901436, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8531250357627869, "rewards/e2e_recall_precision_mixed_reward/std": 0.2510357365012169, "sampling/importance_sampling_ratio/max": 1.9304353952407838, "sampling/importance_sampling_ratio/mean": 1.0000390410423279, "sampling/importance_sampling_ratio/min": 0.4463606238365173, "sampling/sampling_logp_difference/max": 0.9297639846801757, "sampling/sampling_logp_difference/mean": 0.013255661353468895, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1558.8, "completions/max_terminated_length": 1558.8, "completions/mean_length": 1120.89375, "completions/mean_terminated_length": 1120.89375, "completions/min_length": 806.4, "completions/min_terminated_length": 806.4, "entropy": 0.24372271895408631, "epoch": 2.8143360752056403, "frac_reward_zero_std": 0.45, "grad_norm": 0.7181718945503235, "learning_rate": 2.2546644051369033e-07, "loss": 0.0033, "num_tokens": 322511752.0, "reward": 0.8140625119209289, "reward_std": 0.10003995001316071, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8140625119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.20417188704013825, "sampling/importance_sampling_ratio/max": 1.9468562364578248, "sampling/importance_sampling_ratio/mean": 0.9999962449073792, "sampling/importance_sampling_ratio/min": 0.3644874632358551, "sampling/sampling_logp_difference/max": 1.0251569509506226, "sampling/sampling_logp_difference/mean": 0.012897053360939026, "step": 2395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1386.2, "completions/max_terminated_length": 1386.2, "completions/mean_length": 1072.1, "completions/mean_terminated_length": 1072.1, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "entropy": 0.23362427949905396, "epoch": 2.82021151586369, "frac_reward_zero_std": 0.8, "grad_norm": 0.3538127839565277, "learning_rate": 2.248606736127938e-07, "loss": 0.0005, "num_tokens": 323181272.0, "reward": 0.928697919845581, "reward_std": 0.033222814276814464, "rewards/e2e_recall_precision_mixed_reward/mean": 0.928697919845581, "rewards/e2e_recall_precision_mixed_reward/std": 0.12631498724222184, "sampling/importance_sampling_ratio/max": 1.8217214107513429, "sampling/importance_sampling_ratio/mean": 0.9999819278717041, "sampling/importance_sampling_ratio/min": 0.35696284770965575, "sampling/sampling_logp_difference/max": 1.0873285770416259, "sampling/sampling_logp_difference/mean": 0.012447315640747547, "step": 2400 }, { "epoch": 2.82021151586369, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1521.0, "eval_completions/max_terminated_length": 1521.0, "eval_completions/mean_length": 1084.185, "eval_completions/mean_terminated_length": 1084.185, "eval_completions/min_length": 813.92, "eval_completions/min_terminated_length": 813.92, "eval_entropy": 0.25041636466979983, "eval_frac_reward_zero_std": 0.62, "eval_loss": 0.0041066440753638744, "eval_num_tokens": 323181272.0, "eval_reward": 0.7530312585830689, "eval_reward_std": 0.0783327068388462, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7530312597751617, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2980782437324524, "eval_runtime": 415.7277, "eval_samples_per_second": 0.241, "eval_sampling/importance_sampling_ratio/max": 1.9338807630538941, "eval_sampling/importance_sampling_ratio/mean": 1.0000006413459779, "eval_sampling/importance_sampling_ratio/min": 0.32921223118901255, "eval_sampling/sampling_logp_difference/max": 1.2984151482582091, "eval_sampling/sampling_logp_difference/mean": 0.013157993406057358, "eval_steps_per_second": 0.005, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1988.6, "completions/max_terminated_length": 1988.6, "completions/mean_length": 1143.284375, "completions/mean_terminated_length": 1143.284375, "completions/min_length": 789.2, "completions/min_terminated_length": 789.2, "entropy": 0.25749709010124205, "epoch": 2.8260869565217392, "frac_reward_zero_std": 0.45, "grad_norm": 0.5876041054725647, "learning_rate": 2.2425490671189725e-07, "loss": 0.0072, "num_tokens": 323895763.0, "reward": 0.795677101612091, "reward_std": 0.10413843393325806, "rewards/e2e_recall_precision_mixed_reward/mean": 0.795677101612091, "rewards/e2e_recall_precision_mixed_reward/std": 0.3029506832361221, "sampling/importance_sampling_ratio/max": 1.9795888662338257, "sampling/importance_sampling_ratio/mean": 1.0000039696693421, "sampling/importance_sampling_ratio/min": 0.3718868136405945, "sampling/sampling_logp_difference/max": 1.1153724908828735, "sampling/sampling_logp_difference/mean": 0.013322078436613084, "step": 2405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1622.4, "completions/max_terminated_length": 1622.4, "completions/mean_length": 1119.771875, "completions/mean_terminated_length": 1119.771875, "completions/min_length": 832.0, "completions/min_terminated_length": 832.0, "entropy": 0.2605073541402817, "epoch": 2.8319623971797885, "frac_reward_zero_std": 0.4, "grad_norm": 0.5048226118087769, "learning_rate": 2.2364913981100072e-07, "loss": -0.0009, "num_tokens": 324567498.0, "reward": 0.7794270873069763, "reward_std": 0.09945255517959595, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7794270992279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.29298948049545287, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999875545501709, "sampling/importance_sampling_ratio/min": 0.3045585220679641, "sampling/sampling_logp_difference/max": 1.7741375207901, "sampling/sampling_logp_difference/mean": 0.013549309782683849, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1547.8, "completions/max_terminated_length": 1547.8, "completions/mean_length": 1117.696875, "completions/mean_terminated_length": 1117.696875, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "entropy": 0.2522917926311493, "epoch": 2.8378378378378377, "frac_reward_zero_std": 0.65, "grad_norm": 0.5383021831512451, "learning_rate": 2.230433729101042e-07, "loss": -0.003, "num_tokens": 325233065.0, "reward": 0.7588541746139527, "reward_std": 0.07800202667713166, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7588541746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.3657463490962982, "sampling/importance_sampling_ratio/max": 1.9869714260101319, "sampling/importance_sampling_ratio/mean": 0.9999585270881652, "sampling/importance_sampling_ratio/min": 0.33153983354568484, "sampling/sampling_logp_difference/max": 1.1554431915283203, "sampling/sampling_logp_difference/mean": 0.013245697319507598, "step": 2415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1408.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 1086.828125, "completions/mean_terminated_length": 1086.828125, "completions/min_length": 823.2, "completions/min_terminated_length": 823.2, "entropy": 0.24877199530601501, "epoch": 2.843713278495887, "frac_reward_zero_std": 0.55, "grad_norm": 0.7546255588531494, "learning_rate": 2.2243760600920764e-07, "loss": 0.0012, "num_tokens": 325882322.0, "reward": 0.7520833492279053, "reward_std": 0.08643076345324516, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7520833492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.2982165992259979, "sampling/importance_sampling_ratio/max": 1.9108956575393676, "sampling/importance_sampling_ratio/mean": 0.9998527646064759, "sampling/importance_sampling_ratio/min": 0.34085713028907777, "sampling/sampling_logp_difference/max": 1.1262581586837768, "sampling/sampling_logp_difference/mean": 0.01293862983584404, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1657.4, "completions/max_terminated_length": 1657.4, "completions/mean_length": 1171.2, "completions/mean_terminated_length": 1171.2, "completions/min_length": 850.6, "completions/min_terminated_length": 850.6, "entropy": 0.24384477734565735, "epoch": 2.8495887191539366, "frac_reward_zero_std": 0.7, "grad_norm": 0.0, "learning_rate": 2.218318391083111e-07, "loss": -0.002, "num_tokens": 326571378.0, "reward": 0.717187511920929, "reward_std": 0.06420539878308773, "rewards/e2e_recall_precision_mixed_reward/mean": 0.717187511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.334746053814888, "sampling/importance_sampling_ratio/max": 1.934575629234314, "sampling/importance_sampling_ratio/mean": 1.0000715255737305, "sampling/importance_sampling_ratio/min": 0.3990495681762695, "sampling/sampling_logp_difference/max": 1.0094342708587647, "sampling/sampling_logp_difference/mean": 0.012629887461662293, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1639.8, "completions/max_terminated_length": 1639.8, "completions/mean_length": 1121.29375, "completions/mean_terminated_length": 1121.29375, "completions/min_length": 830.0, "completions/min_terminated_length": 830.0, "entropy": 0.24283508062362671, "epoch": 2.855464159811986, "frac_reward_zero_std": 0.7, "grad_norm": 0.46253702044487, "learning_rate": 2.2122607220741457e-07, "loss": 0.0014, "num_tokens": 327278640.0, "reward": 0.7898437857627869, "reward_std": 0.0780556008219719, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7898437857627869, "rewards/e2e_recall_precision_mixed_reward/std": 0.2904230237007141, "sampling/importance_sampling_ratio/max": 1.982173752784729, "sampling/importance_sampling_ratio/mean": 1.00010347366333, "sampling/importance_sampling_ratio/min": 0.3133694648742676, "sampling/sampling_logp_difference/max": 1.2216663122177125, "sampling/sampling_logp_difference/mean": 0.01267168838530779, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1457.4, "completions/max_terminated_length": 1457.4, "completions/mean_length": 1109.309375, "completions/mean_terminated_length": 1109.309375, "completions/min_length": 809.2, "completions/min_terminated_length": 809.2, "entropy": 0.2510025084018707, "epoch": 2.861339600470035, "frac_reward_zero_std": 0.55, "grad_norm": 0.6064744591712952, "learning_rate": 2.2062030530651803e-07, "loss": 0.0057, "num_tokens": 327965651.0, "reward": 0.85703125, "reward_std": 0.10422002114355564, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8570312619209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.24693194925785064, "sampling/importance_sampling_ratio/max": 1.9674418687820434, "sampling/importance_sampling_ratio/mean": 1.0000630259513854, "sampling/importance_sampling_ratio/min": 0.3229108899831772, "sampling/sampling_logp_difference/max": 1.1470834374427796, "sampling/sampling_logp_difference/mean": 0.012952681444585324, "step": 2435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1498.8, "completions/max_terminated_length": 1498.8, "completions/mean_length": 1084.78125, "completions/mean_terminated_length": 1084.78125, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "entropy": 0.25026600658893583, "epoch": 2.867215041128085, "frac_reward_zero_std": 0.6, "grad_norm": 0.4441192150115967, "learning_rate": 2.2001453840562152e-07, "loss": 0.0012, "num_tokens": 328652029.0, "reward": 0.8427083373069764, "reward_std": 0.08037955164909363, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8427083373069764, "rewards/e2e_recall_precision_mixed_reward/std": 0.2896536558866501, "sampling/importance_sampling_ratio/max": 1.9711972713470458, "sampling/importance_sampling_ratio/mean": 0.9999094724655151, "sampling/importance_sampling_ratio/min": 0.3898857295513153, "sampling/sampling_logp_difference/max": 1.0192480564117432, "sampling/sampling_logp_difference/mean": 0.013036524504423141, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 1677.2, "completions/max_terminated_length": 1659.0, "completions/mean_length": 1131.771875, "completions/mean_terminated_length": 1123.9448486328124, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "entropy": 0.2522565394639969, "epoch": 2.873090481786134, "frac_reward_zero_std": 0.65, "grad_norm": 0.49075251817703247, "learning_rate": 2.1940877150472498e-07, "loss": -0.0113, "num_tokens": 329330124.0, "reward": 0.8895833492279053, "reward_std": 0.07341724410653114, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8895833492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.2047812134027481, "sampling/importance_sampling_ratio/max": 1.9501996517181397, "sampling/importance_sampling_ratio/mean": 0.9999414086341858, "sampling/importance_sampling_ratio/min": 0.28733372688293457, "sampling/sampling_logp_difference/max": 1.2867217302322387, "sampling/sampling_logp_difference/mean": 0.013158978708088399, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1515.2, "completions/max_terminated_length": 1515.2, "completions/mean_length": 1143.65625, "completions/mean_terminated_length": 1143.65625, "completions/min_length": 856.6, "completions/min_terminated_length": 856.6, "entropy": 0.26105785369873047, "epoch": 2.8789659224441833, "frac_reward_zero_std": 0.5, "grad_norm": 0.6251075863838196, "learning_rate": 2.1880300460382845e-07, "loss": -0.001, "num_tokens": 330017470.0, "reward": 0.8720833420753479, "reward_std": 0.11039019525051116, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8720833420753479, "rewards/e2e_recall_precision_mixed_reward/std": 0.21222806125879287, "sampling/importance_sampling_ratio/max": 1.8873302221298218, "sampling/importance_sampling_ratio/mean": 1.0000155210494994, "sampling/importance_sampling_ratio/min": 0.3039222886785865, "sampling/sampling_logp_difference/max": 1.8534332752227782, "sampling/sampling_logp_difference/mean": 0.013153030537068844, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 1747.0, "completions/max_terminated_length": 1738.6, "completions/mean_length": 1155.228125, "completions/mean_terminated_length": 1139.845849609375, "completions/min_length": 807.0, "completions/min_terminated_length": 807.0, "entropy": 0.2519680917263031, "epoch": 2.8848413631022325, "frac_reward_zero_std": 0.6, "grad_norm": 0.6380645036697388, "learning_rate": 2.181972377029319e-07, "loss": -0.0153, "num_tokens": 330704599.0, "reward": 0.8036458492279053, "reward_std": 0.07958495393395423, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8036458492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.24552056342363357, "sampling/importance_sampling_ratio/max": 1.8710920572280885, "sampling/importance_sampling_ratio/mean": 1.0000233888626098, "sampling/importance_sampling_ratio/min": 0.3811956226825714, "sampling/sampling_logp_difference/max": 1.0013225317001342, "sampling/sampling_logp_difference/mean": 0.013104490749537945, "step": 2455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1977.6, "completions/max_terminated_length": 1977.6, "completions/mean_length": 1219.80625, "completions/mean_terminated_length": 1219.80625, "completions/min_length": 826.2, "completions/min_terminated_length": 826.2, "entropy": 0.2533602148294449, "epoch": 2.890716803760282, "frac_reward_zero_std": 0.5, "grad_norm": 0.6828557252883911, "learning_rate": 2.1759147080203534e-07, "loss": 0.0033, "num_tokens": 331419417.0, "reward": 0.8088541865348816, "reward_std": 0.07977185398340225, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8088541865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.3063382938504219, "sampling/importance_sampling_ratio/max": 1.9880699634552002, "sampling/importance_sampling_ratio/mean": 1.0000319957733155, "sampling/importance_sampling_ratio/min": 0.3192105397582054, "sampling/sampling_logp_difference/max": 1.4089489936828614, "sampling/sampling_logp_difference/mean": 0.01299858596175909, "step": 2460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1555.4, "completions/max_terminated_length": 1555.4, "completions/mean_length": 1136.778125, "completions/mean_terminated_length": 1136.778125, "completions/min_length": 876.0, "completions/min_terminated_length": 876.0, "entropy": 0.2615731358528137, "epoch": 2.8965922444183314, "frac_reward_zero_std": 0.6, "grad_norm": 0.5761390328407288, "learning_rate": 2.1698570390113883e-07, "loss": -0.0011, "num_tokens": 332091554.0, "reward": 0.8898437738418579, "reward_std": 0.07263861447572709, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8898437738418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.19870344996452333, "sampling/importance_sampling_ratio/max": 1.9780163526535035, "sampling/importance_sampling_ratio/mean": 0.9998313546180725, "sampling/importance_sampling_ratio/min": 0.4730712652206421, "sampling/sampling_logp_difference/max": 0.795024037361145, "sampling/sampling_logp_difference/mean": 0.013214756362140178, "step": 2465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1676.6, "completions/max_terminated_length": 1676.6, "completions/mean_length": 1146.296875, "completions/mean_terminated_length": 1146.296875, "completions/min_length": 832.2, "completions/min_terminated_length": 832.2, "entropy": 0.2526620090007782, "epoch": 2.9024676850763806, "frac_reward_zero_std": 0.65, "grad_norm": 0.6532995104789734, "learning_rate": 2.163799370002423e-07, "loss": 0.0031, "num_tokens": 332762497.0, "reward": 0.9223958492279053, "reward_std": 0.06299638226628304, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9223958492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.17120740562677383, "sampling/importance_sampling_ratio/max": 1.9535242080688477, "sampling/importance_sampling_ratio/mean": 1.0000123977661133, "sampling/importance_sampling_ratio/min": 0.3405495584011078, "sampling/sampling_logp_difference/max": 1.081800150871277, "sampling/sampling_logp_difference/mean": 0.012900187820196151, "step": 2470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1644.6, "completions/max_terminated_length": 1644.6, "completions/mean_length": 1169.665625, "completions/mean_terminated_length": 1169.665625, "completions/min_length": 833.6, "completions/min_terminated_length": 833.6, "entropy": 0.26975159645080565, "epoch": 2.9083431257344303, "frac_reward_zero_std": 0.75, "grad_norm": 0.599644660949707, "learning_rate": 2.1577417009934576e-07, "loss": -0.0003, "num_tokens": 333444758.0, "reward": 0.8614583492279053, "reward_std": 0.059494443237781525, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8614583492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.19280930310487748, "sampling/importance_sampling_ratio/max": 1.9035891771316529, "sampling/importance_sampling_ratio/mean": 1.0000111937522889, "sampling/importance_sampling_ratio/min": 0.3698125422000885, "sampling/sampling_logp_difference/max": 1.0941879034042359, "sampling/sampling_logp_difference/mean": 0.0135704992339015, "step": 2475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1623.8, "completions/max_terminated_length": 1623.8, "completions/mean_length": 1155.8375, "completions/mean_terminated_length": 1155.8375, "completions/min_length": 916.2, "completions/min_terminated_length": 916.2, "entropy": 0.26333553791046144, "epoch": 2.9142185663924796, "frac_reward_zero_std": 0.45, "grad_norm": 0.5579608082771301, "learning_rate": 2.1516840319844922e-07, "loss": 0.0036, "num_tokens": 334145586.0, "reward": 0.8338541865348816, "reward_std": 0.10666598826646805, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8338541865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.27408269345760344, "sampling/importance_sampling_ratio/max": 1.96576726436615, "sampling/importance_sampling_ratio/mean": 1.0000550985336303, "sampling/importance_sampling_ratio/min": 0.393053674697876, "sampling/sampling_logp_difference/max": 1.2964228630065917, "sampling/sampling_logp_difference/mean": 0.013295540772378444, "step": 2480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1590.0, "completions/max_terminated_length": 1590.0, "completions/mean_length": 1181.290625, "completions/mean_terminated_length": 1181.290625, "completions/min_length": 885.2, "completions/min_terminated_length": 885.2, "entropy": 0.2766443967819214, "epoch": 2.920094007050529, "frac_reward_zero_std": 0.7, "grad_norm": 0.44202736020088196, "learning_rate": 2.1456263629755269e-07, "loss": 0.0009, "num_tokens": 334873375.0, "reward": 0.7684895992279053, "reward_std": 0.05524676963686943, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7684895992279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.25875370651483537, "sampling/importance_sampling_ratio/max": 1.9369422912597656, "sampling/importance_sampling_ratio/mean": 1.000017511844635, "sampling/importance_sampling_ratio/min": 0.2905049294233322, "sampling/sampling_logp_difference/max": 1.423995351791382, "sampling/sampling_logp_difference/mean": 0.014015245065093041, "step": 2485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375, "completions/max_length": 1726.0, "completions/max_terminated_length": 1652.6, "completions/mean_length": 1200.453125, "completions/mean_terminated_length": 1190.051318359375, "completions/min_length": 817.0, "completions/min_terminated_length": 817.0, "entropy": 0.2852647066116333, "epoch": 2.925969447708578, "frac_reward_zero_std": 0.5, "grad_norm": 0.6844410300254822, "learning_rate": 2.1395686939665617e-07, "loss": 0.0034, "num_tokens": 335560532.0, "reward": 0.6661458373069763, "reward_std": 0.09234302788972855, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6661458373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.310054212808609, "sampling/importance_sampling_ratio/max": 1.9192859172821044, "sampling/importance_sampling_ratio/mean": 1.0000102877616883, "sampling/importance_sampling_ratio/min": 0.35606696009635924, "sampling/sampling_logp_difference/max": 1.0444335222244263, "sampling/sampling_logp_difference/mean": 0.01423647254705429, "step": 2490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1445.4, "completions/max_terminated_length": 1445.4, "completions/mean_length": 1093.390625, "completions/mean_terminated_length": 1093.390625, "completions/min_length": 831.8, "completions/min_terminated_length": 831.8, "entropy": 0.2477249264717102, "epoch": 2.9318448883666273, "frac_reward_zero_std": 0.7, "grad_norm": 0.6126586198806763, "learning_rate": 2.1335110249575964e-07, "loss": 0.0008, "num_tokens": 336230017.0, "reward": 0.881250011920929, "reward_std": 0.0689006544649601, "rewards/e2e_recall_precision_mixed_reward/mean": 0.881250011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.21311764121055604, "sampling/importance_sampling_ratio/max": 1.9315643072128297, "sampling/importance_sampling_ratio/mean": 0.9998762845993042, "sampling/importance_sampling_ratio/min": 0.4095799148082733, "sampling/sampling_logp_difference/max": 0.9346026420593262, "sampling/sampling_logp_difference/mean": 0.01287180297076702, "step": 2495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1656.2, "completions/max_terminated_length": 1656.2, "completions/mean_length": 1153.73125, "completions/mean_terminated_length": 1153.73125, "completions/min_length": 744.6, "completions/min_terminated_length": 744.6, "entropy": 0.25892970263957976, "epoch": 2.937720329024677, "frac_reward_zero_std": 0.45, "grad_norm": 0.699809193611145, "learning_rate": 2.1274533559486307e-07, "loss": -0.0013, "num_tokens": 336941419.0, "reward": 0.8398437738418579, "reward_std": 0.13118309378623963, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8398437738418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.24315484762191772, "sampling/importance_sampling_ratio/max": 1.9509620666503906, "sampling/importance_sampling_ratio/mean": 0.9998958230018615, "sampling/importance_sampling_ratio/min": 0.3090625017881393, "sampling/sampling_logp_difference/max": 1.36893892288208, "sampling/sampling_logp_difference/mean": 0.013196432776749135, "step": 2500 }, { "epoch": 2.937720329024677, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1615.92, "eval_completions/max_terminated_length": 1615.92, "eval_completions/mean_length": 1160.8625, "eval_completions/mean_terminated_length": 1160.8625, "eval_completions/min_length": 864.4, "eval_completions/min_terminated_length": 864.4, "eval_entropy": 0.2651230132579803, "eval_frac_reward_zero_std": 0.57, "eval_loss": 0.003797011449933052, "eval_num_tokens": 336941419.0, "eval_reward": 0.7690000116825104, "eval_reward_std": 0.08354492157697678, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7690000140666962, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2940467694401741, "eval_runtime": 438.4961, "eval_samples_per_second": 0.228, "eval_sampling/importance_sampling_ratio/max": 1.9412076902389526, "eval_sampling/importance_sampling_ratio/mean": 1.0000221300125123, "eval_sampling/importance_sampling_ratio/min": 0.35337373718619347, "eval_sampling/sampling_logp_difference/max": 1.1907857728004456, "eval_sampling/sampling_logp_difference/mean": 0.013423861749470235, "eval_steps_per_second": 0.005, "step": 2500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1678.8, "completions/max_terminated_length": 1678.8, "completions/mean_length": 1160.890625, "completions/mean_terminated_length": 1160.890625, "completions/min_length": 871.8, "completions/min_terminated_length": 871.8, "entropy": 0.2552634745836258, "epoch": 2.943595769682726, "frac_reward_zero_std": 0.7, "grad_norm": 0.43133029341697693, "learning_rate": 2.1213956869396654e-07, "loss": 0.0021, "num_tokens": 337630280.0, "reward": 0.7451562523841858, "reward_std": 0.058319534920156, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7451562523841858, "rewards/e2e_recall_precision_mixed_reward/std": 0.34460012018680575, "sampling/importance_sampling_ratio/max": 1.9622292518615723, "sampling/importance_sampling_ratio/mean": 0.9999729037284851, "sampling/importance_sampling_ratio/min": 0.36249165832996366, "sampling/sampling_logp_difference/max": 1.1442147254943849, "sampling/sampling_logp_difference/mean": 0.01324941124767065, "step": 2505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1671.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 1157.425, "completions/mean_terminated_length": 1157.425, "completions/min_length": 815.6, "completions/min_terminated_length": 815.6, "entropy": 0.26674684882164, "epoch": 2.9494712103407754, "frac_reward_zero_std": 0.6, "grad_norm": 0.6723577380180359, "learning_rate": 2.1153380179307e-07, "loss": -0.0054, "num_tokens": 338330224.0, "reward": 0.8408854484558106, "reward_std": 0.06934458911418914, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8408854484558106, "rewards/e2e_recall_precision_mixed_reward/std": 0.23293874263763428, "sampling/importance_sampling_ratio/max": 1.9028401374816895, "sampling/importance_sampling_ratio/mean": 1.0001383185386659, "sampling/importance_sampling_ratio/min": 0.38664844036102297, "sampling/sampling_logp_difference/max": 0.9693643093109131, "sampling/sampling_logp_difference/mean": 0.01352162528783083, "step": 2510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1909.0, "completions/max_terminated_length": 1860.6, "completions/mean_length": 1282.859375, "completions/mean_terminated_length": 1279.4164794921876, "completions/min_length": 871.6, "completions/min_terminated_length": 871.6, "entropy": 0.2674229830503464, "epoch": 2.955346650998825, "frac_reward_zero_std": 0.45, "grad_norm": 0.8589381575584412, "learning_rate": 2.109280348921735e-07, "loss": -0.0084, "num_tokens": 339039711.0, "reward": 0.7507812738418579, "reward_std": 0.12396226227283477, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7507812738418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.33741641640663145, "sampling/importance_sampling_ratio/max": 1.9285744667053222, "sampling/importance_sampling_ratio/mean": 1.0001076936721802, "sampling/importance_sampling_ratio/min": 0.3456153243780136, "sampling/sampling_logp_difference/max": 1.2050754070281982, "sampling/sampling_logp_difference/mean": 0.013429709896445274, "step": 2515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1685.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 1150.846875, "completions/mean_terminated_length": 1150.846875, "completions/min_length": 829.2, "completions/min_terminated_length": 829.2, "entropy": 0.2486409604549408, "epoch": 2.9612220916568743, "frac_reward_zero_std": 0.7, "grad_norm": 0.5134599804878235, "learning_rate": 2.1032226799127695e-07, "loss": 0.0046, "num_tokens": 339715694.0, "reward": 0.840625011920929, "reward_std": 0.045786444842815396, "rewards/e2e_recall_precision_mixed_reward/mean": 0.840625011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.2133595198392868, "sampling/importance_sampling_ratio/max": 1.9325678586959838, "sampling/importance_sampling_ratio/mean": 1.000019907951355, "sampling/importance_sampling_ratio/min": 0.41126868724822996, "sampling/sampling_logp_difference/max": 0.9507953882217407, "sampling/sampling_logp_difference/mean": 0.012637078016996383, "step": 2520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1533.4, "completions/max_terminated_length": 1533.4, "completions/mean_length": 1150.565625, "completions/mean_terminated_length": 1150.565625, "completions/min_length": 922.6, "completions/min_terminated_length": 922.6, "entropy": 0.2696438133716583, "epoch": 2.9670975323149236, "frac_reward_zero_std": 0.5, "grad_norm": 0.6207783222198486, "learning_rate": 2.0971650109038041e-07, "loss": 0.0028, "num_tokens": 340408995.0, "reward": 0.8907291650772095, "reward_std": 0.09921484291553498, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8907291650772095, "rewards/e2e_recall_precision_mixed_reward/std": 0.19189145267009736, "sampling/importance_sampling_ratio/max": 1.9334681987762452, "sampling/importance_sampling_ratio/mean": 0.9999927043914795, "sampling/importance_sampling_ratio/min": 0.31438209041953086, "sampling/sampling_logp_difference/max": 1.3856622934341432, "sampling/sampling_logp_difference/mean": 0.013535279594361783, "step": 2525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1737.8, "completions/max_terminated_length": 1737.8, "completions/mean_length": 1185.38125, "completions/mean_terminated_length": 1185.38125, "completions/min_length": 875.0, "completions/min_terminated_length": 875.0, "entropy": 0.2622179836034775, "epoch": 2.972972972972973, "frac_reward_zero_std": 0.7, "grad_norm": 0.0, "learning_rate": 2.0911073418948388e-07, "loss": 0.0044, "num_tokens": 341100077.0, "reward": 0.9062500119209289, "reward_std": 0.07759927660226822, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9062500119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.19565635025501252, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000108242034913, "sampling/importance_sampling_ratio/min": 0.3265382140874863, "sampling/sampling_logp_difference/max": 1.1886168718338013, "sampling/sampling_logp_difference/mean": 0.013404231891036034, "step": 2530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 1126.5625, "completions/mean_terminated_length": 1126.5625, "completions/min_length": 860.8, "completions/min_terminated_length": 860.8, "entropy": 0.2481956660747528, "epoch": 2.9788484136310225, "frac_reward_zero_std": 0.65, "grad_norm": 0.4216060936450958, "learning_rate": 2.0850496728858734e-07, "loss": -0.0042, "num_tokens": 341751889.0, "reward": 0.9169270992279053, "reward_std": 0.07064950466156006, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9169270992279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.18254780471324922, "sampling/importance_sampling_ratio/max": 1.964564847946167, "sampling/importance_sampling_ratio/mean": 1.0000295996665955, "sampling/importance_sampling_ratio/min": 0.3919122636318207, "sampling/sampling_logp_difference/max": 1.0612175703048705, "sampling/sampling_logp_difference/mean": 0.012986170686781406, "step": 2535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1424.6, "completions/max_terminated_length": 1424.6, "completions/mean_length": 1110.55625, "completions/mean_terminated_length": 1110.55625, "completions/min_length": 827.0, "completions/min_terminated_length": 827.0, "entropy": 0.25095831155776976, "epoch": 2.9847238542890717, "frac_reward_zero_std": 0.7, "grad_norm": 0.42990821599960327, "learning_rate": 2.0789920038769083e-07, "loss": -0.0017, "num_tokens": 342441411.0, "reward": 0.9187500238418579, "reward_std": 0.07234707698225976, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9187500238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.1585363283753395, "sampling/importance_sampling_ratio/max": 1.9233515501022338, "sampling/importance_sampling_ratio/mean": 0.9999359726905823, "sampling/importance_sampling_ratio/min": 0.31806144714355467, "sampling/sampling_logp_difference/max": 1.177621603012085, "sampling/sampling_logp_difference/mean": 0.013096613995730876, "step": 2540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1723.8, "completions/max_terminated_length": 1723.8, "completions/mean_length": 1173.43125, "completions/mean_terminated_length": 1173.43125, "completions/min_length": 891.2, "completions/min_terminated_length": 891.2, "entropy": 0.245199453830719, "epoch": 2.990599294947121, "frac_reward_zero_std": 0.6, "grad_norm": 0.4453013837337494, "learning_rate": 2.0729343348679427e-07, "loss": 0.0001, "num_tokens": 343163101.0, "reward": 0.8479166746139526, "reward_std": 0.07755421325564385, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8479166746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.233888578414917, "sampling/importance_sampling_ratio/max": 1.944665217399597, "sampling/importance_sampling_ratio/mean": 0.9999553084373474, "sampling/importance_sampling_ratio/min": 0.2975509911775589, "sampling/sampling_logp_difference/max": 1.2434906959533691, "sampling/sampling_logp_difference/mean": 0.012770450860261916, "step": 2545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1660.2, "completions/max_terminated_length": 1660.2, "completions/mean_length": 1165.56875, "completions/mean_terminated_length": 1165.56875, "completions/min_length": 847.8, "completions/min_terminated_length": 847.8, "entropy": 0.24514368772506714, "epoch": 2.9964747356051706, "frac_reward_zero_std": 0.6, "grad_norm": 0.4213135242462158, "learning_rate": 2.0668766658589773e-07, "loss": -0.0007, "num_tokens": 343836915.0, "reward": 0.801562511920929, "reward_std": 0.07066599875688553, "rewards/e2e_recall_precision_mixed_reward/mean": 0.801562511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.27490745335817335, "sampling/importance_sampling_ratio/max": 1.9496089696884156, "sampling/importance_sampling_ratio/mean": 1.0000056385993958, "sampling/importance_sampling_ratio/min": 0.3634680390357971, "sampling/sampling_logp_difference/max": 1.1743841171264648, "sampling/sampling_logp_difference/mean": 0.012745716236531734, "step": 2550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1566.8, "completions/max_terminated_length": 1566.8, "completions/mean_length": 1120.646875, "completions/mean_terminated_length": 1120.646875, "completions/min_length": 823.2, "completions/min_terminated_length": 823.2, "entropy": 0.23687808513641356, "epoch": 3.00235017626322, "frac_reward_zero_std": 0.75, "grad_norm": 0.0, "learning_rate": 2.060818996850012e-07, "loss": 0.0039, "num_tokens": 344543858.0, "reward": 0.7984375119209289, "reward_std": 0.07464845180511474, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7984375119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.3012055605649948, "sampling/importance_sampling_ratio/max": 1.9975390911102295, "sampling/importance_sampling_ratio/mean": 1.0000576019287108, "sampling/importance_sampling_ratio/min": 0.38463932275772095, "sampling/sampling_logp_difference/max": 1.1026000499725341, "sampling/sampling_logp_difference/mean": 0.012608602643013, "step": 2555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1455.6, "completions/max_terminated_length": 1455.6, "completions/mean_length": 1078.309375, "completions/mean_terminated_length": 1078.309375, "completions/min_length": 782.2, "completions/min_terminated_length": 782.2, "entropy": 0.22939117550849913, "epoch": 3.008225616921269, "frac_reward_zero_std": 0.55, "grad_norm": 0.5061334371566772, "learning_rate": 2.0547613278410465e-07, "loss": -0.0049, "num_tokens": 345197557.0, "reward": 0.8630208373069763, "reward_std": 0.06913707032799721, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8630208373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.2338823139667511, "sampling/importance_sampling_ratio/max": 1.964646315574646, "sampling/importance_sampling_ratio/mean": 0.9999775528907776, "sampling/importance_sampling_ratio/min": 0.32452565338809547, "sampling/sampling_logp_difference/max": 4.082516860961914, "sampling/sampling_logp_difference/mean": 0.012144268304109574, "step": 2560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 1094.321875, "completions/mean_terminated_length": 1094.321875, "completions/min_length": 773.6, "completions/min_terminated_length": 773.6, "entropy": 0.23587908148765563, "epoch": 3.0141010575793183, "frac_reward_zero_std": 0.65, "grad_norm": 0.568652868270874, "learning_rate": 2.0487036588320814e-07, "loss": 0.0004, "num_tokens": 345907900.0, "reward": 0.8179687619209289, "reward_std": 0.07180419340729713, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8179687619209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.2588413327932358, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000184774398804, "sampling/importance_sampling_ratio/min": 0.22604906580163514, "sampling/sampling_logp_difference/max": 3.168667030334473, "sampling/sampling_logp_difference/mean": 0.012671238370239734, "step": 2565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1898.6, "completions/max_terminated_length": 1898.6, "completions/mean_length": 1210.071875, "completions/mean_terminated_length": 1210.071875, "completions/min_length": 864.6, "completions/min_terminated_length": 864.6, "entropy": 0.2588326156139374, "epoch": 3.0199764982373676, "frac_reward_zero_std": 0.4, "grad_norm": 0.80168217420578, "learning_rate": 2.042645989823116e-07, "loss": 0.0028, "num_tokens": 346622019.0, "reward": 0.7994791746139527, "reward_std": 0.1147149682044983, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7994791746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.29545135200023653, "sampling/importance_sampling_ratio/max": 1.985557770729065, "sampling/importance_sampling_ratio/mean": 1.0000051379203796, "sampling/importance_sampling_ratio/min": 0.32981371879577637, "sampling/sampling_logp_difference/max": 1.1296478509902954, "sampling/sampling_logp_difference/mean": 0.013445395790040494, "step": 2570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1761.2, "completions/max_terminated_length": 1742.0, "completions/mean_length": 1148.53125, "completions/mean_terminated_length": 1144.4665771484374, "completions/min_length": 832.6, "completions/min_terminated_length": 832.6, "entropy": 0.26161502599716185, "epoch": 3.0258519388954173, "frac_reward_zero_std": 0.5, "grad_norm": 0.8263538479804993, "learning_rate": 2.0365883208141507e-07, "loss": -0.0032, "num_tokens": 347319897.0, "reward": 0.861718761920929, "reward_std": 0.08996040225028992, "rewards/e2e_recall_precision_mixed_reward/mean": 0.861718761920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.22977930754423143, "sampling/importance_sampling_ratio/max": 1.9648932695388794, "sampling/importance_sampling_ratio/mean": 1.000153088569641, "sampling/importance_sampling_ratio/min": 0.30643958374857905, "sampling/sampling_logp_difference/max": 1.5094314098358155, "sampling/sampling_logp_difference/mean": 0.013726024515926838, "step": 2575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 1160.00625, "completions/mean_terminated_length": 1160.00625, "completions/min_length": 909.2, "completions/min_terminated_length": 909.2, "entropy": 0.248623988032341, "epoch": 3.0317273795534665, "frac_reward_zero_std": 0.65, "grad_norm": 0.0, "learning_rate": 2.0305306518051853e-07, "loss": 0.0021, "num_tokens": 347988235.0, "reward": 0.8042708396911621, "reward_std": 0.06089485287666321, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8042708396911621, "rewards/e2e_recall_precision_mixed_reward/std": 0.2598120987415314, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000272750854493, "sampling/importance_sampling_ratio/min": 0.29968391843140124, "sampling/sampling_logp_difference/max": 1.8056718349456786, "sampling/sampling_logp_difference/mean": 0.01313832849264145, "step": 2580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1599.6, "completions/max_terminated_length": 1599.6, "completions/mean_length": 1228.05625, "completions/mean_terminated_length": 1228.05625, "completions/min_length": 894.6, "completions/min_terminated_length": 894.6, "entropy": 0.25133021771907804, "epoch": 3.0376028202115157, "frac_reward_zero_std": 0.45, "grad_norm": 0.6442340612411499, "learning_rate": 2.0244729827962197e-07, "loss": 0.0001, "num_tokens": 348676301.0, "reward": 0.8003125071525574, "reward_std": 0.11148321777582168, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8003125190734863, "rewards/e2e_recall_precision_mixed_reward/std": 0.27468371093273164, "sampling/importance_sampling_ratio/max": 1.8601706743240356, "sampling/importance_sampling_ratio/mean": 0.9999647378921509, "sampling/importance_sampling_ratio/min": 0.38169229626655576, "sampling/sampling_logp_difference/max": 1.0052381753921509, "sampling/sampling_logp_difference/mean": 0.012974300980567932, "step": 2585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1565.2, "completions/max_terminated_length": 1565.2, "completions/mean_length": 1092.753125, "completions/mean_terminated_length": 1092.753125, "completions/min_length": 796.8, "completions/min_terminated_length": 796.8, "entropy": 0.25149821043014525, "epoch": 3.0434782608695654, "frac_reward_zero_std": 0.6, "grad_norm": 0.39658764004707336, "learning_rate": 2.0184153137872546e-07, "loss": -0.0003, "num_tokens": 349333486.0, "reward": 0.8057291865348816, "reward_std": 0.07588431015610694, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8057291865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.2633956283330917, "sampling/importance_sampling_ratio/max": 1.9642632484436036, "sampling/importance_sampling_ratio/mean": 1.0000943183898925, "sampling/importance_sampling_ratio/min": 0.43040544986724855, "sampling/sampling_logp_difference/max": 1.0892863035202027, "sampling/sampling_logp_difference/mean": 0.013009889796376229, "step": 2590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1633.4, "completions/max_terminated_length": 1633.4, "completions/mean_length": 1181.28125, "completions/mean_terminated_length": 1181.28125, "completions/min_length": 899.2, "completions/min_terminated_length": 899.2, "entropy": 0.24790201783180238, "epoch": 3.0493537015276146, "frac_reward_zero_std": 0.55, "grad_norm": 0.0, "learning_rate": 2.0123576447782892e-07, "loss": -0.0011, "num_tokens": 350027192.0, "reward": 0.901562511920929, "reward_std": 0.09831328690052032, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9015625238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.18027370274066926, "sampling/importance_sampling_ratio/max": 1.9673321008682252, "sampling/importance_sampling_ratio/mean": 1.0000709176063538, "sampling/importance_sampling_ratio/min": 0.36134466230869294, "sampling/sampling_logp_difference/max": 1.0996488094329835, "sampling/sampling_logp_difference/mean": 0.012829454429447652, "step": 2595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1514.4, "completions/max_terminated_length": 1514.4, "completions/mean_length": 1094.546875, "completions/mean_terminated_length": 1094.546875, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "entropy": 0.24174076914787293, "epoch": 3.055229142185664, "frac_reward_zero_std": 0.55, "grad_norm": 0.3839050829410553, "learning_rate": 2.0062999757693238e-07, "loss": -0.0012, "num_tokens": 350717863.0, "reward": 0.8422916889190674, "reward_std": 0.10253577679395676, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8422916889190674, "rewards/e2e_recall_precision_mixed_reward/std": 0.20914290547370912, "sampling/importance_sampling_ratio/max": 1.9016016483306886, "sampling/importance_sampling_ratio/mean": 0.9999129056930542, "sampling/importance_sampling_ratio/min": 0.3612874448299408, "sampling/sampling_logp_difference/max": 1.1594652891159059, "sampling/sampling_logp_difference/mean": 0.013087780401110648, "step": 2600 }, { "epoch": 3.055229142185664, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1527.92, "eval_completions/max_terminated_length": 1527.92, "eval_completions/mean_length": 1115.324375, "eval_completions/mean_terminated_length": 1115.324375, "eval_completions/min_length": 830.56, "eval_completions/min_terminated_length": 830.56, "eval_entropy": 0.24745357692241668, "eval_frac_reward_zero_std": 0.59, "eval_loss": 0.0014860860537737608, "eval_num_tokens": 350717863.0, "eval_reward": 0.764208345413208, "eval_reward_std": 0.08211088687181473, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7642083430290222, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2958526027202606, "eval_runtime": 417.5732, "eval_samples_per_second": 0.239, "eval_sampling/importance_sampling_ratio/max": 1.9392410516738892, "eval_sampling/importance_sampling_ratio/mean": 1.0000256299972534, "eval_sampling/importance_sampling_ratio/min": 0.30997018457235326, "eval_sampling/sampling_logp_difference/max": 2.1229168796539306, "eval_sampling/sampling_logp_difference/mean": 0.013145581409335137, "eval_steps_per_second": 0.005, "step": 2600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375, "completions/max_length": 1621.6, "completions/max_terminated_length": 1563.4, "completions/mean_length": 1089.31875, "completions/mean_terminated_length": 1076.3976806640626, "completions/min_length": 790.0, "completions/min_terminated_length": 790.0, "entropy": 0.23385756611824035, "epoch": 3.061104582843713, "frac_reward_zero_std": 0.25, "grad_norm": 0.890629231929779, "learning_rate": 2.0002423067603585e-07, "loss": -0.0198, "num_tokens": 351392561.0, "reward": 0.7359375119209289, "reward_std": 0.17164357751607895, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7359375119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.3002436876296997, "sampling/importance_sampling_ratio/max": 1.9385489702224732, "sampling/importance_sampling_ratio/mean": 1.0000449180603028, "sampling/importance_sampling_ratio/min": 0.430584990978241, "sampling/sampling_logp_difference/max": 0.9488114356994629, "sampling/sampling_logp_difference/mean": 0.012672055885195732, "step": 2605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1486.8, "completions/max_terminated_length": 1486.8, "completions/mean_length": 1123.546875, "completions/mean_terminated_length": 1123.546875, "completions/min_length": 811.6, "completions/min_terminated_length": 811.6, "entropy": 0.24552173912525177, "epoch": 3.066980023501763, "frac_reward_zero_std": 0.5, "grad_norm": 0.0, "learning_rate": 1.994184637751393e-07, "loss": -0.0008, "num_tokens": 352087152.0, "reward": 0.8692708492279053, "reward_std": 0.10569213405251503, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8692708611488342, "rewards/e2e_recall_precision_mixed_reward/std": 0.21039358377456666, "sampling/importance_sampling_ratio/max": 1.9522597312927246, "sampling/importance_sampling_ratio/mean": 0.999960207939148, "sampling/importance_sampling_ratio/min": 0.3872329980134964, "sampling/sampling_logp_difference/max": 1.0530877590179444, "sampling/sampling_logp_difference/mean": 0.01298385914415121, "step": 2610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1603.4, "completions/max_terminated_length": 1603.4, "completions/mean_length": 1123.93125, "completions/mean_terminated_length": 1123.93125, "completions/min_length": 772.4, "completions/min_terminated_length": 772.4, "entropy": 0.23556230068206788, "epoch": 3.072855464159812, "frac_reward_zero_std": 0.65, "grad_norm": 0.8286966681480408, "learning_rate": 1.988126968742428e-07, "loss": -0.003, "num_tokens": 352771130.0, "reward": 0.7554687619209289, "reward_std": 0.05466256886720657, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7554687619209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.2613639414310455, "sampling/importance_sampling_ratio/max": 1.8773372888565063, "sampling/importance_sampling_ratio/mean": 0.9999829411506653, "sampling/importance_sampling_ratio/min": 0.374179807305336, "sampling/sampling_logp_difference/max": 1.0663438320159913, "sampling/sampling_logp_difference/mean": 0.012645265832543374, "step": 2615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1776.4, "completions/max_terminated_length": 1694.8, "completions/mean_length": 1166.609375, "completions/mean_terminated_length": 1162.793359375, "completions/min_length": 750.8, "completions/min_terminated_length": 750.8, "entropy": 0.23736243844032287, "epoch": 3.0787309048178613, "frac_reward_zero_std": 0.75, "grad_norm": 0.40863195061683655, "learning_rate": 1.9820692997334626e-07, "loss": -0.0048, "num_tokens": 353456681.0, "reward": 0.8520833492279053, "reward_std": 0.06194302663207054, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8520833492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.2339508056640625, "sampling/importance_sampling_ratio/max": 1.8806124925613403, "sampling/importance_sampling_ratio/mean": 1.0000197887420654, "sampling/importance_sampling_ratio/min": 0.3954928398132324, "sampling/sampling_logp_difference/max": 0.9414288401603699, "sampling/sampling_logp_difference/mean": 0.012598930113017558, "step": 2620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1588.8, "completions/max_terminated_length": 1588.8, "completions/mean_length": 1101.5, "completions/mean_terminated_length": 1101.5, "completions/min_length": 774.2, "completions/min_terminated_length": 774.2, "entropy": 0.2684099614620209, "epoch": 3.0846063454759105, "frac_reward_zero_std": 0.6, "grad_norm": 0.6336551308631897, "learning_rate": 1.976011630724497e-07, "loss": -0.0025, "num_tokens": 354135657.0, "reward": 0.8446875095367432, "reward_std": 0.06188718155026436, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8446875095367432, "rewards/e2e_recall_precision_mixed_reward/std": 0.2825276643037796, "sampling/importance_sampling_ratio/max": 1.9036636829376221, "sampling/importance_sampling_ratio/mean": 0.9998575687408447, "sampling/importance_sampling_ratio/min": 0.34003419876098634, "sampling/sampling_logp_difference/max": 1.3024271130561829, "sampling/sampling_logp_difference/mean": 0.013950708508491515, "step": 2625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1805.8, "completions/max_terminated_length": 1805.8, "completions/mean_length": 1192.175, "completions/mean_terminated_length": 1192.175, "completions/min_length": 844.6, "completions/min_terminated_length": 844.6, "entropy": 0.2670820116996765, "epoch": 3.09048178613396, "frac_reward_zero_std": 0.75, "grad_norm": 0.0, "learning_rate": 1.9699539617155316e-07, "loss": 0.0015, "num_tokens": 354847601.0, "reward": 0.7770833492279052, "reward_std": 0.055994272232055664, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7770833492279052, "rewards/e2e_recall_precision_mixed_reward/std": 0.28656685948371885, "sampling/importance_sampling_ratio/max": 1.9876051187515258, "sampling/importance_sampling_ratio/mean": 0.9998288512229919, "sampling/importance_sampling_ratio/min": 0.3348564386367798, "sampling/sampling_logp_difference/max": 1.1707114219665526, "sampling/sampling_logp_difference/mean": 0.013887059316039086, "step": 2630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 1861.8, "completions/max_terminated_length": 1836.0, "completions/mean_length": 1216.940625, "completions/mean_terminated_length": 1209.6089599609375, "completions/min_length": 830.6, "completions/min_terminated_length": 830.6, "entropy": 0.25017284154891967, "epoch": 3.0963572267920094, "frac_reward_zero_std": 0.55, "grad_norm": 0.6755982637405396, "learning_rate": 1.9638962927065662e-07, "loss": -0.0063, "num_tokens": 355569526.0, "reward": 0.7526041984558105, "reward_std": 0.1029469721019268, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7526041984558105, "rewards/e2e_recall_precision_mixed_reward/std": 0.3308384269475937, "sampling/importance_sampling_ratio/max": 1.9819922924041748, "sampling/importance_sampling_ratio/mean": 1.0000592708587646, "sampling/importance_sampling_ratio/min": 0.35656105279922484, "sampling/sampling_logp_difference/max": 1.0571811199188232, "sampling/sampling_logp_difference/mean": 0.013306570611894131, "step": 2635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1624.4, "completions/max_terminated_length": 1624.4, "completions/mean_length": 1158.015625, "completions/mean_terminated_length": 1158.015625, "completions/min_length": 862.0, "completions/min_terminated_length": 862.0, "entropy": 0.2682430505752563, "epoch": 3.1022326674500587, "frac_reward_zero_std": 0.4, "grad_norm": 0.8624192476272583, "learning_rate": 1.9578386236976011e-07, "loss": 0.0081, "num_tokens": 356236859.0, "reward": 0.8321875095367431, "reward_std": 0.09990925379097462, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8321875095367431, "rewards/e2e_recall_precision_mixed_reward/std": 0.23295383900403976, "sampling/importance_sampling_ratio/max": 1.927258276939392, "sampling/importance_sampling_ratio/mean": 1.000026774406433, "sampling/importance_sampling_ratio/min": 0.3010182499885559, "sampling/sampling_logp_difference/max": 1.337507677078247, "sampling/sampling_logp_difference/mean": 0.013790984638035297, "step": 2640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1496.6, "completions/max_terminated_length": 1496.6, "completions/mean_length": 1131.265625, "completions/mean_terminated_length": 1131.265625, "completions/min_length": 816.2, "completions/min_terminated_length": 816.2, "entropy": 0.23955595791339873, "epoch": 3.108108108108108, "frac_reward_zero_std": 0.55, "grad_norm": 0.4020726680755615, "learning_rate": 1.9517809546886358e-07, "loss": 0.002, "num_tokens": 356920928.0, "reward": 0.7802083492279053, "reward_std": 0.07774590328335762, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7802083492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.30067052841186526, "sampling/importance_sampling_ratio/max": 1.9295808315277099, "sampling/importance_sampling_ratio/mean": 0.999874758720398, "sampling/importance_sampling_ratio/min": 0.3665719389915466, "sampling/sampling_logp_difference/max": 1.078517460823059, "sampling/sampling_logp_difference/mean": 0.012707036547362804, "step": 2645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1511.6, "completions/max_terminated_length": 1511.6, "completions/mean_length": 1159.884375, "completions/mean_terminated_length": 1159.884375, "completions/min_length": 906.2, "completions/min_terminated_length": 906.2, "entropy": 0.23638878464698793, "epoch": 3.1139835487661576, "frac_reward_zero_std": 0.5, "grad_norm": 0.7374940514564514, "learning_rate": 1.9457232856796704e-07, "loss": 0.0033, "num_tokens": 357597563.0, "reward": 0.8471354246139526, "reward_std": 0.08411812335252762, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8471354246139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.19521130323410035, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999265074729919, "sampling/importance_sampling_ratio/min": 0.253702437877655, "sampling/sampling_logp_difference/max": 1.50885751247406, "sampling/sampling_logp_difference/mean": 0.01256478950381279, "step": 2650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1847.4, "completions/max_terminated_length": 1847.4, "completions/mean_length": 1120.740625, "completions/mean_terminated_length": 1120.740625, "completions/min_length": 848.4, "completions/min_terminated_length": 848.4, "entropy": 0.2554923087358475, "epoch": 3.119858989424207, "frac_reward_zero_std": 0.6, "grad_norm": 0.7112350463867188, "learning_rate": 1.939665616670705e-07, "loss": 0.0001, "num_tokens": 358283448.0, "reward": 0.7602083384990692, "reward_std": 0.07785547077655793, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7602083384990692, "rewards/e2e_recall_precision_mixed_reward/std": 0.296068063378334, "sampling/importance_sampling_ratio/max": 1.9592190265655518, "sampling/importance_sampling_ratio/mean": 1.000104260444641, "sampling/importance_sampling_ratio/min": 0.33376007676124575, "sampling/sampling_logp_difference/max": 1.1508065223693849, "sampling/sampling_logp_difference/mean": 0.01355198472738266, "step": 2655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1480.2, "completions/max_terminated_length": 1480.2, "completions/mean_length": 1106.128125, "completions/mean_terminated_length": 1106.128125, "completions/min_length": 762.4, "completions/min_terminated_length": 762.4, "entropy": 0.22904159724712372, "epoch": 3.125734430082256, "frac_reward_zero_std": 0.45, "grad_norm": 0.7636279463768005, "learning_rate": 1.93360794766174e-07, "loss": 0.0013, "num_tokens": 358951905.0, "reward": 0.8581250190734864, "reward_std": 0.0991522267460823, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8581250190734864, "rewards/e2e_recall_precision_mixed_reward/std": 0.20884974002838136, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999507665634155, "sampling/importance_sampling_ratio/min": 0.3788798153400421, "sampling/sampling_logp_difference/max": 0.9996652841567993, "sampling/sampling_logp_difference/mean": 0.012032361328601837, "step": 2660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1593.2, "completions/max_terminated_length": 1593.2, "completions/mean_length": 1190.990625, "completions/mean_terminated_length": 1190.990625, "completions/min_length": 888.6, "completions/min_terminated_length": 888.6, "entropy": 0.2583101183176041, "epoch": 3.1316098707403057, "frac_reward_zero_std": 0.55, "grad_norm": 0.444450706243515, "learning_rate": 1.9275502786527743e-07, "loss": -0.0021, "num_tokens": 359662430.0, "reward": 0.7358854293823243, "reward_std": 0.10120119452476502, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7358854293823243, "rewards/e2e_recall_precision_mixed_reward/std": 0.35854376256465914, "sampling/importance_sampling_ratio/max": 1.9906119108200073, "sampling/importance_sampling_ratio/mean": 0.9998132705688476, "sampling/importance_sampling_ratio/min": 0.28991485238075254, "sampling/sampling_logp_difference/max": 1.2943898916244507, "sampling/sampling_logp_difference/mean": 0.013436655886471272, "step": 2665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1751.8, "completions/max_terminated_length": 1751.8, "completions/mean_length": 1202.046875, "completions/mean_terminated_length": 1202.046875, "completions/min_length": 846.2, "completions/min_terminated_length": 846.2, "entropy": 0.25281980633735657, "epoch": 3.137485311398355, "frac_reward_zero_std": 0.3, "grad_norm": 0.7313327789306641, "learning_rate": 1.921492609643809e-07, "loss": -0.0009, "num_tokens": 360371181.0, "reward": 0.7715625166893005, "reward_std": 0.13862871527671813, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7715625166893005, "rewards/e2e_recall_precision_mixed_reward/std": 0.32820015847682954, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000031113624572, "sampling/importance_sampling_ratio/min": 0.4037406623363495, "sampling/sampling_logp_difference/max": 1.0691129326820374, "sampling/sampling_logp_difference/mean": 0.013268834352493286, "step": 2670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1509.4, "completions/max_terminated_length": 1509.4, "completions/mean_length": 1149.64375, "completions/mean_terminated_length": 1149.64375, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "entropy": 0.25547915995121, "epoch": 3.143360752056404, "frac_reward_zero_std": 0.7, "grad_norm": 0.43228134512901306, "learning_rate": 1.9154349406348435e-07, "loss": -0.002, "num_tokens": 361056763.0, "reward": 0.9020833492279052, "reward_std": 0.06553339175879955, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9020833492279052, "rewards/e2e_recall_precision_mixed_reward/std": 0.1975732207298279, "sampling/importance_sampling_ratio/max": 1.9426953315734863, "sampling/importance_sampling_ratio/mean": 1.000106644630432, "sampling/importance_sampling_ratio/min": 0.338831490278244, "sampling/sampling_logp_difference/max": 1.2730365514755249, "sampling/sampling_logp_difference/mean": 0.01309330053627491, "step": 2675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1539.4, "completions/max_terminated_length": 1539.4, "completions/mean_length": 1171.35625, "completions/mean_terminated_length": 1171.35625, "completions/min_length": 934.2, "completions/min_terminated_length": 934.2, "entropy": 0.2575560688972473, "epoch": 3.1492361927144534, "frac_reward_zero_std": 0.6, "grad_norm": 0.0, "learning_rate": 1.9093772716258782e-07, "loss": -0.0022, "num_tokens": 361735949.0, "reward": 0.7588020920753479, "reward_std": 0.09332804828882217, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7588020920753479, "rewards/e2e_recall_precision_mixed_reward/std": 0.3088227480649948, "sampling/importance_sampling_ratio/max": 1.981947898864746, "sampling/importance_sampling_ratio/mean": 0.9999451875686646, "sampling/importance_sampling_ratio/min": 0.3808894753456116, "sampling/sampling_logp_difference/max": 0.9771119594573975, "sampling/sampling_logp_difference/mean": 0.013271708972752094, "step": 2680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1766.6, "completions/max_terminated_length": 1766.6, "completions/mean_length": 1225.53125, "completions/mean_terminated_length": 1225.53125, "completions/min_length": 905.6, "completions/min_terminated_length": 905.6, "entropy": 0.2539756655693054, "epoch": 3.155111633372503, "frac_reward_zero_std": 0.4, "grad_norm": 0.5203064680099487, "learning_rate": 1.903319602616913e-07, "loss": 0.0048, "num_tokens": 362455095.0, "reward": 0.8886458516120911, "reward_std": 0.11133290827274323, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8886458516120911, "rewards/e2e_recall_precision_mixed_reward/std": 0.1978231579065323, "sampling/importance_sampling_ratio/max": 1.8321751832962037, "sampling/importance_sampling_ratio/mean": 1.0000116109848023, "sampling/importance_sampling_ratio/min": 0.3633933126926422, "sampling/sampling_logp_difference/max": 1.0385831832885741, "sampling/sampling_logp_difference/mean": 0.01303493045270443, "step": 2685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1516.4, "completions/max_terminated_length": 1516.4, "completions/mean_length": 1171.89375, "completions/mean_terminated_length": 1171.89375, "completions/min_length": 912.4, "completions/min_terminated_length": 912.4, "entropy": 0.25234430730342866, "epoch": 3.1609870740305523, "frac_reward_zero_std": 0.45, "grad_norm": 0.6282800436019897, "learning_rate": 1.8972619336079477e-07, "loss": 0.0072, "num_tokens": 363167109.0, "reward": 0.752343761920929, "reward_std": 0.09111793488264083, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7523437738418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.30076233446598055, "sampling/importance_sampling_ratio/max": 1.8481305837631226, "sampling/importance_sampling_ratio/mean": 0.9998624682426452, "sampling/importance_sampling_ratio/min": 0.39715090990066526, "sampling/sampling_logp_difference/max": 0.9621063709259033, "sampling/sampling_logp_difference/mean": 0.01284482330083847, "step": 2690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1668.6, "completions/max_terminated_length": 1668.6, "completions/mean_length": 1182.228125, "completions/mean_terminated_length": 1182.228125, "completions/min_length": 896.4, "completions/min_terminated_length": 896.4, "entropy": 0.2626140534877777, "epoch": 3.1668625146886016, "frac_reward_zero_std": 0.5, "grad_norm": 0.6545633673667908, "learning_rate": 1.8912042645989823e-07, "loss": -0.0016, "num_tokens": 363846078.0, "reward": 0.8580729246139527, "reward_std": 0.11600885093212128, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8580729246139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.22339286804199218, "sampling/importance_sampling_ratio/max": 1.9617747783660888, "sampling/importance_sampling_ratio/mean": 0.9999674677848815, "sampling/importance_sampling_ratio/min": 0.3762934744358063, "sampling/sampling_logp_difference/max": 1.0206952333450316, "sampling/sampling_logp_difference/mean": 0.013403966650366783, "step": 2695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1689.8, "completions/max_terminated_length": 1689.8, "completions/mean_length": 1165.94375, "completions/mean_terminated_length": 1165.94375, "completions/min_length": 817.2, "completions/min_terminated_length": 817.2, "entropy": 0.24900731146335603, "epoch": 3.172737955346651, "frac_reward_zero_std": 0.75, "grad_norm": 0.6175356507301331, "learning_rate": 1.885146595590017e-07, "loss": 0.0045, "num_tokens": 364534908.0, "reward": 0.881250011920929, "reward_std": 0.04110444188117981, "rewards/e2e_recall_precision_mixed_reward/mean": 0.881250011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.21535277664661406, "sampling/importance_sampling_ratio/max": 1.8730939149856567, "sampling/importance_sampling_ratio/mean": 1.0000895857810974, "sampling/importance_sampling_ratio/min": 0.3922951459884644, "sampling/sampling_logp_difference/max": 0.9598684906959534, "sampling/sampling_logp_difference/mean": 0.01295645758509636, "step": 2700 }, { "epoch": 3.172737955346651, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1605.36, "eval_completions/max_terminated_length": 1605.36, "eval_completions/mean_length": 1145.510625, "eval_completions/mean_terminated_length": 1145.510625, "eval_completions/min_length": 868.64, "eval_completions/min_terminated_length": 868.64, "eval_entropy": 0.25542369663715364, "eval_frac_reward_zero_std": 0.62, "eval_loss": 0.002036468591541052, "eval_num_tokens": 364534908.0, "eval_reward": 0.7656979310512543, "eval_reward_std": 0.07679802820086479, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7656979298591614, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2901380580663681, "eval_runtime": 440.2857, "eval_samples_per_second": 0.227, "eval_sampling/importance_sampling_ratio/max": 1.976495280265808, "eval_sampling/importance_sampling_ratio/mean": 1.0000054264068603, "eval_sampling/importance_sampling_ratio/min": 0.33669356286525726, "eval_sampling/sampling_logp_difference/max": 1.1901243042945862, "eval_sampling/sampling_logp_difference/mean": 0.013190858326852321, "eval_steps_per_second": 0.005, "step": 2700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1389.0, "completions/max_terminated_length": 1389.0, "completions/mean_length": 1077.025, "completions/mean_terminated_length": 1077.025, "completions/min_length": 785.2, "completions/min_terminated_length": 785.2, "entropy": 0.22875811159610748, "epoch": 3.1786133960047005, "frac_reward_zero_std": 0.7, "grad_norm": 0.5512154698371887, "learning_rate": 1.8790889265810513e-07, "loss": 0.0026, "num_tokens": 365209364.0, "reward": 0.950000011920929, "reward_std": 0.06352402791380882, "rewards/e2e_recall_precision_mixed_reward/mean": 0.950000011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.13289770856499672, "sampling/importance_sampling_ratio/max": 1.9655184507369996, "sampling/importance_sampling_ratio/mean": 0.999955701828003, "sampling/importance_sampling_ratio/min": 0.4531009137630463, "sampling/sampling_logp_difference/max": 0.8712506055831909, "sampling/sampling_logp_difference/mean": 0.011957179754972458, "step": 2705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1785.4, "completions/max_terminated_length": 1785.4, "completions/mean_length": 1248.996875, "completions/mean_terminated_length": 1248.996875, "completions/min_length": 848.6, "completions/min_terminated_length": 848.6, "entropy": 0.2754764974117279, "epoch": 3.1844888366627497, "frac_reward_zero_std": 0.55, "grad_norm": 0.0, "learning_rate": 1.8730312575720862e-07, "loss": 0.0031, "num_tokens": 365918243.0, "reward": 0.7947916746139526, "reward_std": 0.07183088660240174, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7947916746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.2950818926095963, "sampling/importance_sampling_ratio/max": 1.963321566581726, "sampling/importance_sampling_ratio/mean": 0.9999127626419068, "sampling/importance_sampling_ratio/min": 0.33920138217235946, "sampling/sampling_logp_difference/max": 3.2019110441207888, "sampling/sampling_logp_difference/mean": 0.013784621469676494, "step": 2710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1431.2, "completions/max_terminated_length": 1431.2, "completions/mean_length": 1097.05, "completions/mean_terminated_length": 1097.05, "completions/min_length": 799.8, "completions/min_terminated_length": 799.8, "entropy": 0.23867568373680115, "epoch": 3.190364277320799, "frac_reward_zero_std": 0.5, "grad_norm": 0.5815135836601257, "learning_rate": 1.8669735885631208e-07, "loss": 0.0015, "num_tokens": 366575027.0, "reward": 0.839062511920929, "reward_std": 0.11312199383974075, "rewards/e2e_recall_precision_mixed_reward/mean": 0.839062511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.25024967789649966, "sampling/importance_sampling_ratio/max": 1.9745015382766724, "sampling/importance_sampling_ratio/mean": 0.9999501466751098, "sampling/importance_sampling_ratio/min": 0.2752092361450195, "sampling/sampling_logp_difference/max": 1.4468851327896117, "sampling/sampling_logp_difference/mean": 0.012776543572545051, "step": 2715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1545.8, "completions/max_terminated_length": 1545.8, "completions/mean_length": 1100.778125, "completions/mean_terminated_length": 1100.778125, "completions/min_length": 741.6, "completions/min_terminated_length": 741.6, "entropy": 0.23921369910240173, "epoch": 3.196239717978848, "frac_reward_zero_std": 0.65, "grad_norm": 0.3939354121685028, "learning_rate": 1.8609159195541555e-07, "loss": -0.0015, "num_tokens": 367257964.0, "reward": 0.71171875, "reward_std": 0.04870991818606853, "rewards/e2e_recall_precision_mixed_reward/mean": 0.711718761920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.3381441444158554, "sampling/importance_sampling_ratio/max": 1.9069401264190673, "sampling/importance_sampling_ratio/mean": 1.0001029253005982, "sampling/importance_sampling_ratio/min": 0.37758385539054873, "sampling/sampling_logp_difference/max": 1.0444597005844116, "sampling/sampling_logp_difference/mean": 0.012857604771852493, "step": 2720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1508.4, "completions/max_terminated_length": 1508.4, "completions/mean_length": 1143.01875, "completions/mean_terminated_length": 1143.01875, "completions/min_length": 891.0, "completions/min_terminated_length": 891.0, "entropy": 0.25859539210796356, "epoch": 3.202115158636898, "frac_reward_zero_std": 0.75, "grad_norm": 0.0, "learning_rate": 1.85485825054519e-07, "loss": 0.0021, "num_tokens": 367949282.0, "reward": 0.8816145896911621, "reward_std": 0.04836602807044983, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8816145896911621, "rewards/e2e_recall_precision_mixed_reward/std": 0.15824837386608123, "sampling/importance_sampling_ratio/max": 1.9535146951675415, "sampling/importance_sampling_ratio/mean": 0.9999256372451782, "sampling/importance_sampling_ratio/min": 0.3492464393377304, "sampling/sampling_logp_difference/max": 1.4997458934783936, "sampling/sampling_logp_difference/mean": 0.01358124241232872, "step": 2725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1605.8, "completions/max_terminated_length": 1605.8, "completions/mean_length": 1144.803125, "completions/mean_terminated_length": 1144.803125, "completions/min_length": 821.6, "completions/min_terminated_length": 821.6, "entropy": 0.23201414942741394, "epoch": 3.207990599294947, "frac_reward_zero_std": 0.35, "grad_norm": 0.4138811528682709, "learning_rate": 1.8488005815362247e-07, "loss": 0.0035, "num_tokens": 368656163.0, "reward": 0.8598958492279053, "reward_std": 0.13527624905109406, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8598958492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.21621148884296418, "sampling/importance_sampling_ratio/max": 1.981991744041443, "sampling/importance_sampling_ratio/mean": 0.9999581217765808, "sampling/importance_sampling_ratio/min": 0.25117518454790116, "sampling/sampling_logp_difference/max": 1.7178606867790223, "sampling/sampling_logp_difference/mean": 0.01244208738207817, "step": 2730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1546.2, "completions/max_terminated_length": 1546.2, "completions/mean_length": 1133.43125, "completions/mean_terminated_length": 1133.43125, "completions/min_length": 841.0, "completions/min_terminated_length": 841.0, "entropy": 0.24755406975746155, "epoch": 3.2138660399529964, "frac_reward_zero_std": 0.7, "grad_norm": 0.6708360910415649, "learning_rate": 1.8427429125272596e-07, "loss": 0.0021, "num_tokens": 369316909.0, "reward": 0.8466145992279053, "reward_std": 0.05173058435320854, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8466145992279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.2515969604253769, "sampling/importance_sampling_ratio/max": 1.9390371322631836, "sampling/importance_sampling_ratio/mean": 0.9999060034751892, "sampling/importance_sampling_ratio/min": 0.317415851354599, "sampling/sampling_logp_difference/max": 1.308079767227173, "sampling/sampling_logp_difference/mean": 0.012958027422428131, "step": 2735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1652.6, "completions/max_terminated_length": 1652.6, "completions/mean_length": 1203.203125, "completions/mean_terminated_length": 1203.203125, "completions/min_length": 840.4, "completions/min_terminated_length": 840.4, "entropy": 0.260967755317688, "epoch": 3.219741480611046, "frac_reward_zero_std": 0.55, "grad_norm": 0.7388750314712524, "learning_rate": 1.8366852435182942e-07, "loss": -0.0027, "num_tokens": 370055742.0, "reward": 0.8242187619209289, "reward_std": 0.08670442402362824, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8242187619209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.25668842494487765, "sampling/importance_sampling_ratio/max": 1.9822523593902588, "sampling/importance_sampling_ratio/mean": 1.0000456333160401, "sampling/importance_sampling_ratio/min": 0.4507920503616333, "sampling/sampling_logp_difference/max": 0.8359850645065308, "sampling/sampling_logp_difference/mean": 0.013544493354856968, "step": 2740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1580.0, "completions/max_terminated_length": 1580.0, "completions/mean_length": 1152.165625, "completions/mean_terminated_length": 1152.165625, "completions/min_length": 845.6, "completions/min_terminated_length": 845.6, "entropy": 0.24874697625637054, "epoch": 3.2256169212690953, "frac_reward_zero_std": 0.5, "grad_norm": 0.6925174593925476, "learning_rate": 1.830627574509329e-07, "loss": 0.0021, "num_tokens": 370757203.0, "reward": 0.9194791913032532, "reward_std": 0.08012870997190476, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9194791913032532, "rewards/e2e_recall_precision_mixed_reward/std": 0.1872466579079628, "sampling/importance_sampling_ratio/max": 1.917311930656433, "sampling/importance_sampling_ratio/mean": 1.0000473737716675, "sampling/importance_sampling_ratio/min": 0.3609789401292801, "sampling/sampling_logp_difference/max": 1.0609714031219482, "sampling/sampling_logp_difference/mean": 0.013005911372601986, "step": 2745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1640.8, "completions/max_terminated_length": 1640.8, "completions/mean_length": 1168.940625, "completions/mean_terminated_length": 1168.940625, "completions/min_length": 891.2, "completions/min_terminated_length": 891.2, "entropy": 0.25572892725467683, "epoch": 3.2314923619271445, "frac_reward_zero_std": 0.6, "grad_norm": 0.42577195167541504, "learning_rate": 1.8245699055003632e-07, "loss": -0.0107, "num_tokens": 371470048.0, "reward": 0.8330729484558106, "reward_std": 0.1004362728446722, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8330729484558106, "rewards/e2e_recall_precision_mixed_reward/std": 0.2636726438999176, "sampling/importance_sampling_ratio/max": 1.963629388809204, "sampling/importance_sampling_ratio/mean": 1.000008511543274, "sampling/importance_sampling_ratio/min": 0.3254010289907455, "sampling/sampling_logp_difference/max": 1.3843281507492065, "sampling/sampling_logp_difference/mean": 0.013244516961276532, "step": 2750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1605.8, "completions/max_terminated_length": 1605.8, "completions/mean_length": 1150.315625, "completions/mean_terminated_length": 1150.315625, "completions/min_length": 817.0, "completions/min_terminated_length": 817.0, "entropy": 0.27356540560722353, "epoch": 3.2373678025851937, "frac_reward_zero_std": 0.6, "grad_norm": 0.6596607565879822, "learning_rate": 1.8185122364913979e-07, "loss": -0.0018, "num_tokens": 372153701.0, "reward": 0.8119791865348815, "reward_std": 0.07490371987223625, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8119791865348815, "rewards/e2e_recall_precision_mixed_reward/std": 0.26950059831142426, "sampling/importance_sampling_ratio/max": 1.9359875917434692, "sampling/importance_sampling_ratio/mean": 1.0000138759613038, "sampling/importance_sampling_ratio/min": 0.3807866334915161, "sampling/sampling_logp_difference/max": 1.138900876045227, "sampling/sampling_logp_difference/mean": 0.014055828377604485, "step": 2755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1861.2, "completions/max_terminated_length": 1861.2, "completions/mean_length": 1226.828125, "completions/mean_terminated_length": 1226.828125, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "entropy": 0.25837584137916564, "epoch": 3.2432432432432434, "frac_reward_zero_std": 0.65, "grad_norm": 0.0, "learning_rate": 1.8124545674824328e-07, "loss": 0.0002, "num_tokens": 372891134.0, "reward": 0.7364583492279053, "reward_std": 0.0557606402784586, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7364583492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.3232154339551926, "sampling/importance_sampling_ratio/max": 1.9625327587127686, "sampling/importance_sampling_ratio/mean": 1.0000850915908814, "sampling/importance_sampling_ratio/min": 0.3799369066953659, "sampling/sampling_logp_difference/max": 1.1325337648391725, "sampling/sampling_logp_difference/mean": 0.01331845298409462, "step": 2760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1545.6, "completions/max_terminated_length": 1545.6, "completions/mean_length": 1169.91875, "completions/mean_terminated_length": 1169.91875, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "entropy": 0.2515501230955124, "epoch": 3.2491186839012927, "frac_reward_zero_std": 0.5, "grad_norm": 0.5899607539176941, "learning_rate": 1.8063968984734674e-07, "loss": 0.0021, "num_tokens": 373575492.0, "reward": 0.8873437523841858, "reward_std": 0.07848574072122574, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8873437523841858, "rewards/e2e_recall_precision_mixed_reward/std": 0.19398850649595262, "sampling/importance_sampling_ratio/max": 1.8839639186859132, "sampling/importance_sampling_ratio/mean": 0.9999752283096314, "sampling/importance_sampling_ratio/min": 0.353549987077713, "sampling/sampling_logp_difference/max": 1.1025007724761964, "sampling/sampling_logp_difference/mean": 0.013109220005571843, "step": 2765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1605.4, "completions/max_terminated_length": 1605.4, "completions/mean_length": 1208.865625, "completions/mean_terminated_length": 1208.865625, "completions/min_length": 946.2, "completions/min_terminated_length": 946.2, "entropy": 0.262452107667923, "epoch": 3.254994124559342, "frac_reward_zero_std": 0.6, "grad_norm": 0.6078686714172363, "learning_rate": 1.800339229464502e-07, "loss": 0.0051, "num_tokens": 374303577.0, "reward": 0.7220833420753479, "reward_std": 0.07195360362529754, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7220833420753479, "rewards/e2e_recall_precision_mixed_reward/std": 0.3346565842628479, "sampling/importance_sampling_ratio/max": 1.9912936210632324, "sampling/importance_sampling_ratio/mean": 0.9999467730522156, "sampling/importance_sampling_ratio/min": 0.319632551074028, "sampling/sampling_logp_difference/max": 1.1714837074279785, "sampling/sampling_logp_difference/mean": 0.01357471402734518, "step": 2770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1565.4, "completions/max_terminated_length": 1565.4, "completions/mean_length": 1132.821875, "completions/mean_terminated_length": 1132.821875, "completions/min_length": 803.6, "completions/min_terminated_length": 803.6, "entropy": 0.2516249448060989, "epoch": 3.260869565217391, "frac_reward_zero_std": 0.65, "grad_norm": 0.715618908405304, "learning_rate": 1.7942815604555366e-07, "loss": -0.0009, "num_tokens": 374955184.0, "reward": 0.8385416746139527, "reward_std": 0.056407293677330016, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8385416865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.20846658647060395, "sampling/importance_sampling_ratio/max": 1.827885890007019, "sampling/importance_sampling_ratio/mean": 1.000091540813446, "sampling/importance_sampling_ratio/min": 0.37121389210224154, "sampling/sampling_logp_difference/max": 1.0708453416824342, "sampling/sampling_logp_difference/mean": 0.013041505217552185, "step": 2775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1532.4, "completions/max_terminated_length": 1532.4, "completions/mean_length": 1131.478125, "completions/mean_terminated_length": 1131.478125, "completions/min_length": 832.4, "completions/min_terminated_length": 832.4, "entropy": 0.24903405606746673, "epoch": 3.266745005875441, "frac_reward_zero_std": 0.6, "grad_norm": 0.7905736565589905, "learning_rate": 1.7882238914465713e-07, "loss": 0.0009, "num_tokens": 375624905.0, "reward": 0.9010416865348816, "reward_std": 0.08152148127555847, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9010416865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.1657480537891388, "sampling/importance_sampling_ratio/max": 1.894256854057312, "sampling/importance_sampling_ratio/mean": 0.9999601244926453, "sampling/importance_sampling_ratio/min": 0.3288370221853256, "sampling/sampling_logp_difference/max": 1.1827893733978272, "sampling/sampling_logp_difference/mean": 0.012998024187982082, "step": 2780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1464.0, "completions/max_terminated_length": 1464.0, "completions/mean_length": 1107.759375, "completions/mean_terminated_length": 1107.759375, "completions/min_length": 828.0, "completions/min_terminated_length": 828.0, "entropy": 0.2499726951122284, "epoch": 3.27262044653349, "frac_reward_zero_std": 0.8, "grad_norm": 0.5189323425292969, "learning_rate": 1.7821662224376062e-07, "loss": 0.001, "num_tokens": 376336844.0, "reward": 0.8536458492279053, "reward_std": 0.04098552390933037, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8536458492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.24642951488494874, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000021255016327, "sampling/importance_sampling_ratio/min": 0.4128114223480225, "sampling/sampling_logp_difference/max": 1.009805178642273, "sampling/sampling_logp_difference/mean": 0.013290311396121978, "step": 2785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 1713.4, "completions/max_terminated_length": 1711.2, "completions/mean_length": 1176.153125, "completions/mean_terminated_length": 1169.3762939453125, "completions/min_length": 884.4, "completions/min_terminated_length": 884.4, "entropy": 0.25248327255249026, "epoch": 3.2784958871915393, "frac_reward_zero_std": 0.55, "grad_norm": 0.42183059453964233, "learning_rate": 1.7761085534286405e-07, "loss": -0.0027, "num_tokens": 377038645.0, "reward": 0.8496875047683716, "reward_std": 0.08127287812530995, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8496875047683716, "rewards/e2e_recall_precision_mixed_reward/std": 0.24507201015949248, "sampling/importance_sampling_ratio/max": 1.9258476972579956, "sampling/importance_sampling_ratio/mean": 1.0000847578048706, "sampling/importance_sampling_ratio/min": 0.3457289457321167, "sampling/sampling_logp_difference/max": 1.0756362676620483, "sampling/sampling_logp_difference/mean": 0.013472091406583786, "step": 2790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1634.0, "completions/max_terminated_length": 1634.0, "completions/mean_length": 1164.784375, "completions/mean_terminated_length": 1164.784375, "completions/min_length": 835.6, "completions/min_terminated_length": 835.6, "entropy": 0.2675493985414505, "epoch": 3.2843713278495885, "frac_reward_zero_std": 0.65, "grad_norm": 0.43047988414764404, "learning_rate": 1.7700508844196752e-07, "loss": -0.001, "num_tokens": 377747424.0, "reward": 0.8713541746139526, "reward_std": 0.10082083642482757, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8713541746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.27632923126220704, "sampling/importance_sampling_ratio/max": 1.967568302154541, "sampling/importance_sampling_ratio/mean": 0.9999858021736145, "sampling/importance_sampling_ratio/min": 0.3365649715065956, "sampling/sampling_logp_difference/max": 1.2312506914138794, "sampling/sampling_logp_difference/mean": 0.013835505396127701, "step": 2795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1770.2, "completions/max_terminated_length": 1714.6, "completions/mean_length": 1174.909375, "completions/mean_terminated_length": 1171.104638671875, "completions/min_length": 831.0, "completions/min_terminated_length": 831.0, "entropy": 0.2529326885938644, "epoch": 3.290246768507638, "frac_reward_zero_std": 0.4, "grad_norm": 0.45852166414260864, "learning_rate": 1.7639932154107098e-07, "loss": -0.0045, "num_tokens": 378429455.0, "reward": 0.8065625309944153, "reward_std": 0.12179329991340637, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8065625190734863, "rewards/e2e_recall_precision_mixed_reward/std": 0.25495838224887846, "sampling/importance_sampling_ratio/max": 1.95216703414917, "sampling/importance_sampling_ratio/mean": 0.9999139785766602, "sampling/importance_sampling_ratio/min": 0.3560261070728302, "sampling/sampling_logp_difference/max": 1.1213287830352783, "sampling/sampling_logp_difference/mean": 0.013436216115951537, "step": 2800 }, { "epoch": 3.290246768507638, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1508.4, "eval_completions/max_terminated_length": 1508.4, "eval_completions/mean_length": 1131.9075, "eval_completions/mean_terminated_length": 1131.9075, "eval_completions/min_length": 855.72, "eval_completions/min_terminated_length": 855.72, "eval_entropy": 0.25337180495262146, "eval_frac_reward_zero_std": 0.56, "eval_loss": -0.00035420674248598516, "eval_num_tokens": 378429455.0, "eval_reward": 0.7613020920753479, "eval_reward_std": 0.08451422370970249, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7613020932674408, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2949679693579674, "eval_runtime": 419.5616, "eval_samples_per_second": 0.238, "eval_sampling/importance_sampling_ratio/max": 1.9767439079284668, "eval_sampling/importance_sampling_ratio/mean": 1.0000990962982177, "eval_sampling/importance_sampling_ratio/min": 0.31874189218506216, "eval_sampling/sampling_logp_difference/max": 1.4742623805999755, "eval_sampling/sampling_logp_difference/mean": 0.013241245038807392, "eval_steps_per_second": 0.005, "step": 2800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1747.4, "completions/max_terminated_length": 1747.4, "completions/mean_length": 1235.028125, "completions/mean_terminated_length": 1235.028125, "completions/min_length": 959.8, "completions/min_terminated_length": 959.8, "entropy": 0.24830776453018188, "epoch": 3.2961222091656874, "frac_reward_zero_std": 0.55, "grad_norm": 0.3905949592590332, "learning_rate": 1.7579355464017444e-07, "loss": 0.0029, "num_tokens": 379146824.0, "reward": 0.8643229365348816, "reward_std": 0.09152939319610595, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8643229365348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.23323871195316315, "sampling/importance_sampling_ratio/max": 1.9724785804748535, "sampling/importance_sampling_ratio/mean": 1.0000085353851318, "sampling/importance_sampling_ratio/min": 0.34535167515277865, "sampling/sampling_logp_difference/max": 1.126982283592224, "sampling/sampling_logp_difference/mean": 0.012922433577477932, "step": 2805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1577.4, "completions/max_terminated_length": 1577.4, "completions/mean_length": 1131.065625, "completions/mean_terminated_length": 1131.065625, "completions/min_length": 853.4, "completions/min_terminated_length": 853.4, "entropy": 0.26412600874900816, "epoch": 3.3019976498237367, "frac_reward_zero_std": 0.7, "grad_norm": 0.6829572916030884, "learning_rate": 1.7518778773927793e-07, "loss": -0.0053, "num_tokens": 379835933.0, "reward": 0.870312511920929, "reward_std": 0.05251576006412506, "rewards/e2e_recall_precision_mixed_reward/mean": 0.870312511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.22497628554701804, "sampling/importance_sampling_ratio/max": 1.9401256084442138, "sampling/importance_sampling_ratio/mean": 1.00009765625, "sampling/importance_sampling_ratio/min": 0.3727611839771271, "sampling/sampling_logp_difference/max": 1.0502751111984252, "sampling/sampling_logp_difference/mean": 0.013848403468728065, "step": 2810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1655.6, "completions/max_terminated_length": 1655.6, "completions/mean_length": 1195.8375, "completions/mean_terminated_length": 1195.8375, "completions/min_length": 827.6, "completions/min_terminated_length": 827.6, "entropy": 0.24685363173484803, "epoch": 3.3078730904817863, "frac_reward_zero_std": 0.55, "grad_norm": 0.6114619970321655, "learning_rate": 1.745820208383814e-07, "loss": -0.0025, "num_tokens": 380550489.0, "reward": 0.8239583492279052, "reward_std": 0.09283133745193481, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8239583492279052, "rewards/e2e_recall_precision_mixed_reward/std": 0.28367829620838164, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999561309814453, "sampling/importance_sampling_ratio/min": 0.31022571623325346, "sampling/sampling_logp_difference/max": 1.3538453340530396, "sampling/sampling_logp_difference/mean": 0.013016180135309696, "step": 2815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1631.6, "completions/max_terminated_length": 1631.6, "completions/mean_length": 1201.109375, "completions/mean_terminated_length": 1201.109375, "completions/min_length": 895.0, "completions/min_terminated_length": 895.0, "entropy": 0.25497291088104246, "epoch": 3.3137485311398356, "frac_reward_zero_std": 0.6, "grad_norm": 0.4095427393913269, "learning_rate": 1.7397625393748486e-07, "loss": 0.0055, "num_tokens": 381267212.0, "reward": 0.815625011920929, "reward_std": 0.0877026379108429, "rewards/e2e_recall_precision_mixed_reward/mean": 0.815625011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.2964575469493866, "sampling/importance_sampling_ratio/max": 1.9889522314071655, "sampling/importance_sampling_ratio/mean": 0.9999091506004334, "sampling/importance_sampling_ratio/min": 0.3753842532634735, "sampling/sampling_logp_difference/max": 1.032554531097412, "sampling/sampling_logp_difference/mean": 0.013489954732358455, "step": 2820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1559.2, "completions/max_terminated_length": 1559.2, "completions/mean_length": 1157.134375, "completions/mean_terminated_length": 1157.134375, "completions/min_length": 814.6, "completions/min_terminated_length": 814.6, "entropy": 0.25930267572402954, "epoch": 3.319623971797885, "frac_reward_zero_std": 0.55, "grad_norm": 0.6701661348342896, "learning_rate": 1.7337048703658832e-07, "loss": -0.0002, "num_tokens": 381964775.0, "reward": 0.8198958396911621, "reward_std": 0.106954687833786, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8198958396911621, "rewards/e2e_recall_precision_mixed_reward/std": 0.2636798769235611, "sampling/importance_sampling_ratio/max": 1.962524652481079, "sampling/importance_sampling_ratio/mean": 1.0000115036964417, "sampling/importance_sampling_ratio/min": 0.23018737211823465, "sampling/sampling_logp_difference/max": 1.7914230823516846, "sampling/sampling_logp_difference/mean": 0.013436510972678662, "step": 2825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1636.4, "completions/max_terminated_length": 1636.4, "completions/mean_length": 1192.090625, "completions/mean_terminated_length": 1192.090625, "completions/min_length": 817.0, "completions/min_terminated_length": 817.0, "entropy": 0.259701132774353, "epoch": 3.325499412455934, "frac_reward_zero_std": 0.55, "grad_norm": 0.5898331999778748, "learning_rate": 1.7276472013569176e-07, "loss": -0.0043, "num_tokens": 382688948.0, "reward": 0.7366666793823242, "reward_std": 0.09564396217465401, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7366666793823242, "rewards/e2e_recall_precision_mixed_reward/std": 0.31994677186012266, "sampling/importance_sampling_ratio/max": 1.8907454490661622, "sampling/importance_sampling_ratio/mean": 0.9998792886734009, "sampling/importance_sampling_ratio/min": 0.35978134274482726, "sampling/sampling_logp_difference/max": 1.0968676328659057, "sampling/sampling_logp_difference/mean": 0.013501750491559505, "step": 2830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1637.4, "completions/max_terminated_length": 1637.4, "completions/mean_length": 1182.41875, "completions/mean_terminated_length": 1182.41875, "completions/min_length": 785.6, "completions/min_terminated_length": 785.6, "entropy": 0.23543883562088014, "epoch": 3.3313748531139837, "frac_reward_zero_std": 0.55, "grad_norm": 0.5775417685508728, "learning_rate": 1.7215895323479524e-07, "loss": 0.0014, "num_tokens": 383374234.0, "reward": 0.8911458492279053, "reward_std": 0.11471164971590042, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8911458492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.20149447470903398, "sampling/importance_sampling_ratio/max": 1.8971025705337525, "sampling/importance_sampling_ratio/mean": 1.0000097513198853, "sampling/importance_sampling_ratio/min": 0.38829224109649657, "sampling/sampling_logp_difference/max": 1.031450629234314, "sampling/sampling_logp_difference/mean": 0.012517093122005463, "step": 2835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1589.0, "completions/max_terminated_length": 1589.0, "completions/mean_length": 1181.99375, "completions/mean_terminated_length": 1181.99375, "completions/min_length": 906.2, "completions/min_terminated_length": 906.2, "entropy": 0.26151891946792605, "epoch": 3.337250293772033, "frac_reward_zero_std": 0.6, "grad_norm": 0.4710766673088074, "learning_rate": 1.715531863338987e-07, "loss": -0.002, "num_tokens": 384075800.0, "reward": 0.7330729365348816, "reward_std": 0.09073313027620315, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7330729365348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.3040100812911987, "sampling/importance_sampling_ratio/max": 1.8934704780578613, "sampling/importance_sampling_ratio/mean": 1.000049901008606, "sampling/importance_sampling_ratio/min": 0.3532286584377289, "sampling/sampling_logp_difference/max": 1.0922445774078369, "sampling/sampling_logp_difference/mean": 0.013711506128311157, "step": 2840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1627.6, "completions/max_terminated_length": 1627.6, "completions/mean_length": 1227.815625, "completions/mean_terminated_length": 1227.815625, "completions/min_length": 944.4, "completions/min_terminated_length": 944.4, "entropy": 0.2659549415111542, "epoch": 3.343125734430082, "frac_reward_zero_std": 0.7, "grad_norm": 0.0, "learning_rate": 1.7094741943300217e-07, "loss": 0.0042, "num_tokens": 384780157.0, "reward": 0.8932291746139527, "reward_std": 0.06313644722104073, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8932291746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.19119044840335847, "sampling/importance_sampling_ratio/max": 1.9835964679718017, "sampling/importance_sampling_ratio/mean": 1.000018262863159, "sampling/importance_sampling_ratio/min": 0.35918577909469607, "sampling/sampling_logp_difference/max": 1.0723339557647704, "sampling/sampling_logp_difference/mean": 0.013455265015363694, "step": 2845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1667.8, "completions/max_terminated_length": 1667.8, "completions/mean_length": 1186.8, "completions/mean_terminated_length": 1186.8, "completions/min_length": 892.2, "completions/min_terminated_length": 892.2, "entropy": 0.26246914863586424, "epoch": 3.3490011750881314, "frac_reward_zero_std": 0.55, "grad_norm": 0.5709245204925537, "learning_rate": 1.7034165253210563e-07, "loss": -0.0035, "num_tokens": 385461997.0, "reward": 0.8755208373069763, "reward_std": 0.09409238025546074, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8755208373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.2078237384557724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000224113464355, "sampling/importance_sampling_ratio/min": 0.29794262945652006, "sampling/sampling_logp_difference/max": 1.3214488983154298, "sampling/sampling_logp_difference/mean": 0.013476687669754028, "step": 2850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1621.0, "completions/max_terminated_length": 1621.0, "completions/mean_length": 1135.009375, "completions/mean_terminated_length": 1135.009375, "completions/min_length": 795.6, "completions/min_terminated_length": 795.6, "entropy": 0.2704127460718155, "epoch": 3.354876615746181, "frac_reward_zero_std": 0.65, "grad_norm": 0.6261325478553772, "learning_rate": 1.697358856312091e-07, "loss": 0.0004, "num_tokens": 386161392.0, "reward": 0.8057291984558106, "reward_std": 0.04726305603981018, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8057291984558106, "rewards/e2e_recall_precision_mixed_reward/std": 0.2478840470314026, "sampling/importance_sampling_ratio/max": 1.988915705680847, "sampling/importance_sampling_ratio/mean": 1.000031304359436, "sampling/importance_sampling_ratio/min": 0.363059838116169, "sampling/sampling_logp_difference/max": 1.249585008621216, "sampling/sampling_logp_difference/mean": 0.013962916098535061, "step": 2855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1632.8, "completions/max_terminated_length": 1632.8, "completions/mean_length": 1203.0375, "completions/mean_terminated_length": 1203.0375, "completions/min_length": 960.2, "completions/min_terminated_length": 960.2, "entropy": 0.26425559520721437, "epoch": 3.3607520564042304, "frac_reward_zero_std": 0.55, "grad_norm": 0.3815658986568451, "learning_rate": 1.6913011873031259e-07, "loss": -0.0073, "num_tokens": 386859852.0, "reward": 0.8492187619209289, "reward_std": 0.07277037352323532, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8492187619209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.2327181279659271, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999239444732666, "sampling/importance_sampling_ratio/min": 0.2944484859704971, "sampling/sampling_logp_difference/max": 1.2413968324661255, "sampling/sampling_logp_difference/mean": 0.013403261639177799, "step": 2860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1474.4, "completions/max_terminated_length": 1474.4, "completions/mean_length": 1176.075, "completions/mean_terminated_length": 1176.075, "completions/min_length": 923.0, "completions/min_terminated_length": 923.0, "entropy": 0.2643334299325943, "epoch": 3.3666274970622796, "frac_reward_zero_std": 0.7, "grad_norm": 0.6973140835762024, "learning_rate": 1.6852435182941605e-07, "loss": 0.0031, "num_tokens": 387544932.0, "reward": 0.746875, "reward_std": 0.0624865785241127, "rewards/e2e_recall_precision_mixed_reward/mean": 0.746875, "rewards/e2e_recall_precision_mixed_reward/std": 0.2990709781646729, "sampling/importance_sampling_ratio/max": 1.9863412141799928, "sampling/importance_sampling_ratio/mean": 1.000007724761963, "sampling/importance_sampling_ratio/min": 0.39941216707229615, "sampling/sampling_logp_difference/max": 0.9842625856399536, "sampling/sampling_logp_difference/mean": 0.013473628833889962, "step": 2865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 1162.63125, "completions/mean_terminated_length": 1162.63125, "completions/min_length": 848.6, "completions/min_terminated_length": 848.6, "entropy": 0.26365244686603545, "epoch": 3.372502937720329, "frac_reward_zero_std": 0.65, "grad_norm": 0.6711568236351013, "learning_rate": 1.6791858492851948e-07, "loss": -0.003, "num_tokens": 388234830.0, "reward": 0.8809895992279053, "reward_std": 0.053984729945659636, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8809895992279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.16809427738189697, "sampling/importance_sampling_ratio/max": 1.95412278175354, "sampling/importance_sampling_ratio/mean": 1.0000274538993836, "sampling/importance_sampling_ratio/min": 0.40029398798942567, "sampling/sampling_logp_difference/max": 0.9596251726150513, "sampling/sampling_logp_difference/mean": 0.01345615666359663, "step": 2870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1530.4, "completions/max_terminated_length": 1530.4, "completions/mean_length": 1171.425, "completions/mean_terminated_length": 1171.425, "completions/min_length": 896.2, "completions/min_terminated_length": 896.2, "entropy": 0.25766043066978456, "epoch": 3.3783783783783785, "frac_reward_zero_std": 0.7, "grad_norm": 0.5919622182846069, "learning_rate": 1.6731281802762295e-07, "loss": 0.0019, "num_tokens": 388903718.0, "reward": 0.9229166865348816, "reward_std": 0.06342011243104935, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9229166865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.16442330777645112, "sampling/importance_sampling_ratio/max": 1.9627609968185424, "sampling/importance_sampling_ratio/mean": 1.000002133846283, "sampling/importance_sampling_ratio/min": 0.29471515119075775, "sampling/sampling_logp_difference/max": 1.2630483984947205, "sampling/sampling_logp_difference/mean": 0.013212603889405728, "step": 2875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1785.8, "completions/max_terminated_length": 1785.8, "completions/mean_length": 1277.615625, "completions/mean_terminated_length": 1277.615625, "completions/min_length": 892.4, "completions/min_terminated_length": 892.4, "entropy": 0.2642282694578171, "epoch": 3.3842538190364277, "frac_reward_zero_std": 0.65, "grad_norm": 0.0, "learning_rate": 1.667070511267264e-07, "loss": 0.0003, "num_tokens": 389659083.0, "reward": 0.8364583492279053, "reward_std": 0.07245562374591827, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8364583492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.22363150119781494, "sampling/importance_sampling_ratio/max": 1.9598698139190673, "sampling/importance_sampling_ratio/mean": 0.99994957447052, "sampling/importance_sampling_ratio/min": 0.3109358698129654, "sampling/sampling_logp_difference/max": 1.2494572162628175, "sampling/sampling_logp_difference/mean": 0.01340102069079876, "step": 2880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1564.4, "completions/max_terminated_length": 1564.4, "completions/mean_length": 1184.853125, "completions/mean_terminated_length": 1184.853125, "completions/min_length": 883.6, "completions/min_terminated_length": 883.6, "entropy": 0.2694912374019623, "epoch": 3.390129259694477, "frac_reward_zero_std": 0.65, "grad_norm": 0.41767993569374084, "learning_rate": 1.661012842258299e-07, "loss": -0.0038, "num_tokens": 390364700.0, "reward": 0.8158854365348815, "reward_std": 0.06047187112271786, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8158854365348815, "rewards/e2e_recall_precision_mixed_reward/std": 0.29825079143047334, "sampling/importance_sampling_ratio/max": 1.983485460281372, "sampling/importance_sampling_ratio/mean": 1.000064730644226, "sampling/importance_sampling_ratio/min": 0.3307294547557831, "sampling/sampling_logp_difference/max": 1.4649325132369995, "sampling/sampling_logp_difference/mean": 0.01370826605707407, "step": 2885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1892.6, "completions/max_terminated_length": 1892.6, "completions/mean_length": 1323.515625, "completions/mean_terminated_length": 1323.515625, "completions/min_length": 993.6, "completions/min_terminated_length": 993.6, "entropy": 0.28171729743480683, "epoch": 3.3960047003525267, "frac_reward_zero_std": 0.7, "grad_norm": 0.5330216884613037, "learning_rate": 1.6549551732493336e-07, "loss": -0.0033, "num_tokens": 391118577.0, "reward": 0.7697916746139526, "reward_std": 0.0524015374481678, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7697916746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.27035410702228546, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999969756603241, "sampling/importance_sampling_ratio/min": 0.24186867326498032, "sampling/sampling_logp_difference/max": 1.6836973428726196, "sampling/sampling_logp_difference/mean": 0.013766255043447018, "step": 2890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1748.2, "completions/max_terminated_length": 1748.2, "completions/mean_length": 1213.021875, "completions/mean_terminated_length": 1213.021875, "completions/min_length": 896.6, "completions/min_terminated_length": 896.6, "entropy": 0.29188904762268064, "epoch": 3.401880141010576, "frac_reward_zero_std": 0.6, "grad_norm": 0.6556338667869568, "learning_rate": 1.6488975042403683e-07, "loss": -0.0004, "num_tokens": 391879512.0, "reward": 0.8236458420753479, "reward_std": 0.09013433307409287, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8236458420753479, "rewards/e2e_recall_precision_mixed_reward/std": 0.26258903443813325, "sampling/importance_sampling_ratio/max": 1.9666601896286011, "sampling/importance_sampling_ratio/mean": 0.9999792337417602, "sampling/importance_sampling_ratio/min": 0.34474579691886903, "sampling/sampling_logp_difference/max": 1.1234654664993287, "sampling/sampling_logp_difference/mean": 0.014820769429206848, "step": 2895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1670.0, "completions/max_terminated_length": 1670.0, "completions/mean_length": 1213.39375, "completions/mean_terminated_length": 1213.39375, "completions/min_length": 897.6, "completions/min_terminated_length": 897.6, "entropy": 0.28398237824440004, "epoch": 3.407755581668625, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 1.642839835231403e-07, "loss": -0.0024, "num_tokens": 392588134.0, "reward": 0.8369791746139527, "reward_std": 0.045029500126838685, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8369791746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.28497442603111267, "sampling/importance_sampling_ratio/max": 1.9755486488342284, "sampling/importance_sampling_ratio/mean": 0.9999412894248962, "sampling/importance_sampling_ratio/min": 0.37593441605567934, "sampling/sampling_logp_difference/max": 1.0409663915634155, "sampling/sampling_logp_difference/mean": 0.01409566793590784, "step": 2900 }, { "epoch": 3.407755581668625, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1682.08, "eval_completions/max_terminated_length": 1682.08, "eval_completions/mean_length": 1202.545, "eval_completions/mean_terminated_length": 1202.545, "eval_completions/min_length": 904.52, "eval_completions/min_terminated_length": 904.52, "eval_entropy": 0.2751586544513702, "eval_frac_reward_zero_std": 0.64, "eval_loss": 0.004005698952823877, "eval_num_tokens": 392588134.0, "eval_reward": 0.7671562600135803, "eval_reward_std": 0.07342575185000896, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7671562600135803, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2901870983839035, "eval_runtime": 462.7739, "eval_samples_per_second": 0.216, "eval_sampling/importance_sampling_ratio/max": 1.9416738891601562, "eval_sampling/importance_sampling_ratio/mean": 0.9999853873252869, "eval_sampling/importance_sampling_ratio/min": 0.3018297159723193, "eval_sampling/sampling_logp_difference/max": 2.2107747268676756, "eval_sampling/sampling_logp_difference/mean": 0.013962351121008396, "eval_steps_per_second": 0.004, "step": 2900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1622.6, "completions/max_terminated_length": 1622.6, "completions/mean_length": 1219.08125, "completions/mean_terminated_length": 1219.08125, "completions/min_length": 910.8, "completions/min_terminated_length": 910.8, "entropy": 0.27651492953300477, "epoch": 3.4136310223266744, "frac_reward_zero_std": 0.75, "grad_norm": 0.523073673248291, "learning_rate": 1.6367821662224375e-07, "loss": -0.0013, "num_tokens": 393292144.0, "reward": 0.8482812643051147, "reward_std": 0.04793854169547558, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8482812643051147, "rewards/e2e_recall_precision_mixed_reward/std": 0.28270514160394666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999916076660156, "sampling/importance_sampling_ratio/min": 0.2504092216491699, "sampling/sampling_logp_difference/max": 1.4600980758666993, "sampling/sampling_logp_difference/mean": 0.014081763848662377, "step": 2905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1609.6, "completions/max_terminated_length": 1609.6, "completions/mean_length": 1176.940625, "completions/mean_terminated_length": 1176.940625, "completions/min_length": 832.0, "completions/min_terminated_length": 832.0, "entropy": 0.2731807053089142, "epoch": 3.4195064629847236, "frac_reward_zero_std": 0.6, "grad_norm": 0.43564677238464355, "learning_rate": 1.6307244972134721e-07, "loss": -0.0022, "num_tokens": 393994445.0, "reward": 0.8734375, "reward_std": 0.08320709615945816, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8734375, "rewards/e2e_recall_precision_mixed_reward/std": 0.20147269070148469, "sampling/importance_sampling_ratio/max": 1.970658278465271, "sampling/importance_sampling_ratio/mean": 1.0000769376754761, "sampling/importance_sampling_ratio/min": 0.4103114724159241, "sampling/sampling_logp_difference/max": 0.9029069662094116, "sampling/sampling_logp_difference/mean": 0.013669577986001968, "step": 2910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1716.2, "completions/max_terminated_length": 1716.2, "completions/mean_length": 1238.3, "completions/mean_terminated_length": 1238.3, "completions/min_length": 924.8, "completions/min_terminated_length": 924.8, "entropy": 0.27329388856887815, "epoch": 3.4253819036427733, "frac_reward_zero_std": 0.7, "grad_norm": 0.6953060030937195, "learning_rate": 1.6246668282045068e-07, "loss": -0.0019, "num_tokens": 394732077.0, "reward": 0.9040104269981384, "reward_std": 0.042052581906318665, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9040104269981384, "rewards/e2e_recall_precision_mixed_reward/std": 0.17316112592816352, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999583959579468, "sampling/importance_sampling_ratio/min": 0.33822412192821505, "sampling/sampling_logp_difference/max": 1.1926729202270507, "sampling/sampling_logp_difference/mean": 0.013831990212202072, "step": 2915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1761.4, "completions/max_terminated_length": 1761.4, "completions/mean_length": 1285.715625, "completions/mean_terminated_length": 1285.715625, "completions/min_length": 906.4, "completions/min_terminated_length": 906.4, "entropy": 0.2749119311571121, "epoch": 3.4312573443008225, "frac_reward_zero_std": 0.5, "grad_norm": 0.4019206762313843, "learning_rate": 1.6186091591955414e-07, "loss": -0.0004, "num_tokens": 395468258.0, "reward": 0.8369791746139527, "reward_std": 0.10039222538471222, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8369791746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.24194913506507873, "sampling/importance_sampling_ratio/max": 1.9278610944747925, "sampling/importance_sampling_ratio/mean": 1.000080668926239, "sampling/importance_sampling_ratio/min": 0.30234331358224154, "sampling/sampling_logp_difference/max": 1.7760730504989624, "sampling/sampling_logp_difference/mean": 0.013776698522269726, "step": 2920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1701.4, "completions/max_terminated_length": 1701.4, "completions/mean_length": 1198.328125, "completions/mean_terminated_length": 1198.328125, "completions/min_length": 898.6, "completions/min_terminated_length": 898.6, "entropy": 0.25736156702041624, "epoch": 3.4371327849588718, "frac_reward_zero_std": 0.5, "grad_norm": 0.6544657945632935, "learning_rate": 1.612551490186576e-07, "loss": 0.0055, "num_tokens": 396206859.0, "reward": 0.8437500119209289, "reward_std": 0.08604325987398624, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8437500119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.2119861736893654, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999913215637207, "sampling/importance_sampling_ratio/min": 0.27694354951381683, "sampling/sampling_logp_difference/max": 1.3848939895629884, "sampling/sampling_logp_difference/mean": 0.013181288540363312, "step": 2925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1950.6, "completions/max_terminated_length": 1864.6, "completions/mean_length": 1320.728125, "completions/mean_terminated_length": 1317.262109375, "completions/min_length": 933.6, "completions/min_terminated_length": 933.6, "entropy": 0.2831844985485077, "epoch": 3.4430082256169214, "frac_reward_zero_std": 0.55, "grad_norm": 0.4558798670768738, "learning_rate": 1.6064938211776107e-07, "loss": -0.0071, "num_tokens": 396956608.0, "reward": 0.789843761920929, "reward_std": 0.09386796951293945, "rewards/e2e_recall_precision_mixed_reward/mean": 0.789843761920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.28504001498222353, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999266624450683, "sampling/importance_sampling_ratio/min": 0.26929228343069556, "sampling/sampling_logp_difference/max": 1.7527684926986695, "sampling/sampling_logp_difference/mean": 0.01404192615300417, "step": 2930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1707.6, "completions/max_terminated_length": 1707.6, "completions/mean_length": 1202.7375, "completions/mean_terminated_length": 1202.7375, "completions/min_length": 831.8, "completions/min_terminated_length": 831.8, "entropy": 0.2845179855823517, "epoch": 3.4488836662749707, "frac_reward_zero_std": 0.55, "grad_norm": 0.6161671280860901, "learning_rate": 1.6004361521686456e-07, "loss": -0.0046, "num_tokens": 397661164.0, "reward": 0.8372395992279053, "reward_std": 0.08098205551505089, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8372396051883697, "rewards/e2e_recall_precision_mixed_reward/std": 0.23883938044309616, "sampling/importance_sampling_ratio/max": 1.922814679145813, "sampling/importance_sampling_ratio/mean": 1.0000414729118348, "sampling/importance_sampling_ratio/min": 0.34642059803009034, "sampling/sampling_logp_difference/max": 1.499940037727356, "sampling/sampling_logp_difference/mean": 0.014074294827878475, "step": 2935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1735.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 1231.615625, "completions/mean_terminated_length": 1231.615625, "completions/min_length": 959.6, "completions/min_terminated_length": 959.6, "entropy": 0.26078474819660186, "epoch": 3.45475910693302, "frac_reward_zero_std": 0.5, "grad_norm": 0.43476438522338867, "learning_rate": 1.5943784831596802e-07, "loss": 0.003, "num_tokens": 398367009.0, "reward": 0.8515625119209289, "reward_std": 0.08800038211047649, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8515625119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.240531849861145, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999879479408265, "sampling/importance_sampling_ratio/min": 0.38882568180561067, "sampling/sampling_logp_difference/max": 1.0931127309799193, "sampling/sampling_logp_difference/mean": 0.013227501325309277, "step": 2940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 1271.440625, "completions/mean_terminated_length": 1271.440625, "completions/min_length": 914.0, "completions/min_terminated_length": 914.0, "entropy": 0.28265722990036013, "epoch": 3.460634547591069, "frac_reward_zero_std": 0.7, "grad_norm": 0.43009883165359497, "learning_rate": 1.5883208141507148e-07, "loss": 0.0029, "num_tokens": 399059678.0, "reward": 0.8828125119209289, "reward_std": 0.04852218851447106, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8828125119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.15546741783618928, "sampling/importance_sampling_ratio/max": 1.891363549232483, "sampling/importance_sampling_ratio/mean": 0.9999841690063477, "sampling/importance_sampling_ratio/min": 0.4459798693656921, "sampling/sampling_logp_difference/max": 0.8462659478187561, "sampling/sampling_logp_difference/mean": 0.014082801155745983, "step": 2945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1823.4, "completions/max_terminated_length": 1823.4, "completions/mean_length": 1283.26875, "completions/mean_terminated_length": 1283.26875, "completions/min_length": 919.8, "completions/min_terminated_length": 919.8, "entropy": 0.28141863346099855, "epoch": 3.466509988249119, "frac_reward_zero_std": 0.45, "grad_norm": 0.6030286550521851, "learning_rate": 1.5822631451417492e-07, "loss": 0.0095, "num_tokens": 399779444.0, "reward": 0.7938541889190673, "reward_std": 0.10225731804966927, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7938541889190673, "rewards/e2e_recall_precision_mixed_reward/std": 0.25199161395430564, "sampling/importance_sampling_ratio/max": 1.995455813407898, "sampling/importance_sampling_ratio/mean": 1.0000591158866883, "sampling/importance_sampling_ratio/min": 0.35674205124378205, "sampling/sampling_logp_difference/max": 1.0800941228866576, "sampling/sampling_logp_difference/mean": 0.013914234191179275, "step": 2950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1739.4, "completions/max_terminated_length": 1739.4, "completions/mean_length": 1261.48125, "completions/mean_terminated_length": 1261.48125, "completions/min_length": 909.8, "completions/min_terminated_length": 909.8, "entropy": 0.2674350649118423, "epoch": 3.472385428907168, "frac_reward_zero_std": 0.75, "grad_norm": 0.0, "learning_rate": 1.5762054761327838e-07, "loss": 0.0024, "num_tokens": 400484766.0, "reward": 0.9380208492279053, "reward_std": 0.04721375107765198, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9380208492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.1494040608406067, "sampling/importance_sampling_ratio/max": 1.9453770637512207, "sampling/importance_sampling_ratio/mean": 1.0000848650932312, "sampling/importance_sampling_ratio/min": 0.26389922499656676, "sampling/sampling_logp_difference/max": 1.8547302961349488, "sampling/sampling_logp_difference/mean": 0.013454168103635311, "step": 2955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1712.4, "completions/max_terminated_length": 1712.4, "completions/mean_length": 1213.08125, "completions/mean_terminated_length": 1213.08125, "completions/min_length": 851.0, "completions/min_terminated_length": 851.0, "entropy": 0.25921439528465273, "epoch": 3.4782608695652173, "frac_reward_zero_std": 0.7, "grad_norm": 0.7261385917663574, "learning_rate": 1.5701478071238187e-07, "loss": 0.0014, "num_tokens": 401193560.0, "reward": 0.9666666746139526, "reward_std": 0.04748408943414688, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9666666746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.08946052193641663, "sampling/importance_sampling_ratio/max": 1.9423861503601074, "sampling/importance_sampling_ratio/mean": 1.000016164779663, "sampling/importance_sampling_ratio/min": 0.3426270544528961, "sampling/sampling_logp_difference/max": 1.0901034355163575, "sampling/sampling_logp_difference/mean": 0.013289996609091758, "step": 2960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1657.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 1260.8875, "completions/mean_terminated_length": 1260.8875, "completions/min_length": 949.2, "completions/min_terminated_length": 949.2, "entropy": 0.28407627046108247, "epoch": 3.484136310223267, "frac_reward_zero_std": 0.8, "grad_norm": 0.5941851139068604, "learning_rate": 1.5640901381148533e-07, "loss": 0.0043, "num_tokens": 401933636.0, "reward": 0.93125, "reward_std": 0.03618033975362778, "rewards/e2e_recall_precision_mixed_reward/mean": 0.93125, "rewards/e2e_recall_precision_mixed_reward/std": 0.13101719617843627, "sampling/importance_sampling_ratio/max": 1.919576144218445, "sampling/importance_sampling_ratio/mean": 0.9998687863349914, "sampling/importance_sampling_ratio/min": 0.36393317878246306, "sampling/sampling_logp_difference/max": 1.128061878681183, "sampling/sampling_logp_difference/mean": 0.014223051071166993, "step": 2965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1554.4, "completions/max_terminated_length": 1554.4, "completions/mean_length": 1201.296875, "completions/mean_terminated_length": 1201.296875, "completions/min_length": 940.8, "completions/min_terminated_length": 940.8, "entropy": 0.2826466590166092, "epoch": 3.490011750881316, "frac_reward_zero_std": 0.65, "grad_norm": 0.749692976474762, "learning_rate": 1.558032469105888e-07, "loss": -0.0038, "num_tokens": 402682451.0, "reward": 0.8328125, "reward_std": 0.06184412464499474, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8328125, "rewards/e2e_recall_precision_mixed_reward/std": 0.2695111095905304, "sampling/importance_sampling_ratio/max": 1.8911922693252563, "sampling/importance_sampling_ratio/mean": 0.9998687863349914, "sampling/importance_sampling_ratio/min": 0.31807301938533783, "sampling/sampling_logp_difference/max": 1.2873031139373778, "sampling/sampling_logp_difference/mean": 0.014129643328487873, "step": 2970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1598.6, "completions/max_terminated_length": 1598.6, "completions/mean_length": 1208.403125, "completions/mean_terminated_length": 1208.403125, "completions/min_length": 891.0, "completions/min_terminated_length": 891.0, "entropy": 0.2606518566608429, "epoch": 3.4958871915393654, "frac_reward_zero_std": 0.6, "grad_norm": 2.265922784805298, "learning_rate": 1.5519748000969226e-07, "loss": 0.0057, "num_tokens": 403390836.0, "reward": 0.9125000238418579, "reward_std": 0.08965013474225998, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9125000238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.15533160120248796, "sampling/importance_sampling_ratio/max": 1.9447248697280883, "sampling/importance_sampling_ratio/mean": 1.0000758647918702, "sampling/importance_sampling_ratio/min": 0.31950002945959566, "sampling/sampling_logp_difference/max": 1.5232036113739014, "sampling/sampling_logp_difference/mean": 0.013233417831361293, "step": 2975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1818.2, "completions/max_terminated_length": 1818.2, "completions/mean_length": 1310.58125, "completions/mean_terminated_length": 1310.58125, "completions/min_length": 1002.6, "completions/min_terminated_length": 1002.6, "entropy": 0.2939403593540192, "epoch": 3.5017626321974147, "frac_reward_zero_std": 0.5, "grad_norm": 0.6451891660690308, "learning_rate": 1.5459171310879575e-07, "loss": 0.0034, "num_tokens": 404117470.0, "reward": 0.8412500023841858, "reward_std": 0.09634547531604767, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8412500023841858, "rewards/e2e_recall_precision_mixed_reward/std": 0.27206834256649015, "sampling/importance_sampling_ratio/max": 1.98237042427063, "sampling/importance_sampling_ratio/mean": 1.0001010060310365, "sampling/importance_sampling_ratio/min": 0.3473032474517822, "sampling/sampling_logp_difference/max": 1.2496708631515503, "sampling/sampling_logp_difference/mean": 0.01439862884581089, "step": 2980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1741.4, "completions/max_terminated_length": 1741.4, "completions/mean_length": 1230.04375, "completions/mean_terminated_length": 1230.04375, "completions/min_length": 934.8, "completions/min_terminated_length": 934.8, "entropy": 0.2777364790439606, "epoch": 3.507638072855464, "frac_reward_zero_std": 0.6, "grad_norm": 0.47499480843544006, "learning_rate": 1.539859462078992e-07, "loss": -0.0007, "num_tokens": 404824988.0, "reward": 0.8776041746139527, "reward_std": 0.1006715402007103, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8776041746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.20324081927537918, "sampling/importance_sampling_ratio/max": 1.9307854890823364, "sampling/importance_sampling_ratio/mean": 1.0000447750091552, "sampling/importance_sampling_ratio/min": 0.305920846760273, "sampling/sampling_logp_difference/max": 1.9527714014053346, "sampling/sampling_logp_difference/mean": 0.013740032538771629, "step": 2985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1687.8, "completions/max_terminated_length": 1687.8, "completions/mean_length": 1252.390625, "completions/mean_terminated_length": 1252.390625, "completions/min_length": 951.0, "completions/min_terminated_length": 951.0, "entropy": 0.27992117404937744, "epoch": 3.5135135135135136, "frac_reward_zero_std": 0.55, "grad_norm": 0.0, "learning_rate": 1.5338017930700267e-07, "loss": 0.0011, "num_tokens": 405562777.0, "reward": 0.7064583420753479, "reward_std": 0.07313631623983383, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7064583420753479, "rewards/e2e_recall_precision_mixed_reward/std": 0.37596028447151186, "sampling/importance_sampling_ratio/max": 1.9501430988311768, "sampling/importance_sampling_ratio/mean": 0.9999474048614502, "sampling/importance_sampling_ratio/min": 0.392292720079422, "sampling/sampling_logp_difference/max": 1.0178369522094726, "sampling/sampling_logp_difference/mean": 0.014073196426033973, "step": 2990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1640.4, "completions/max_terminated_length": 1640.4, "completions/mean_length": 1195.1, "completions/mean_terminated_length": 1195.1, "completions/min_length": 846.8, "completions/min_terminated_length": 846.8, "entropy": 0.2721045553684235, "epoch": 3.519388954171563, "frac_reward_zero_std": 0.6, "grad_norm": 0.31242266297340393, "learning_rate": 1.527744124061061e-07, "loss": -0.0014, "num_tokens": 406274617.0, "reward": 0.8744791746139526, "reward_std": 0.06375965140759945, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8744791746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.21375257074832915, "sampling/importance_sampling_ratio/max": 1.9984694957733153, "sampling/importance_sampling_ratio/mean": 1.0000406622886657, "sampling/importance_sampling_ratio/min": 0.3253972053527832, "sampling/sampling_logp_difference/max": 1.2722053050994873, "sampling/sampling_logp_difference/mean": 0.013604764640331269, "step": 2995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1595.6, "completions/max_terminated_length": 1595.6, "completions/mean_length": 1300.559375, "completions/mean_terminated_length": 1300.559375, "completions/min_length": 1036.8, "completions/min_terminated_length": 1036.8, "entropy": 0.2830276906490326, "epoch": 3.525264394829612, "frac_reward_zero_std": 0.6, "grad_norm": 0.33567795157432556, "learning_rate": 1.5216864550520957e-07, "loss": -0.0012, "num_tokens": 406998252.0, "reward": 0.91015625, "reward_std": 0.08269033730030059, "rewards/e2e_recall_precision_mixed_reward/mean": 0.91015625, "rewards/e2e_recall_precision_mixed_reward/std": 0.18473040610551833, "sampling/importance_sampling_ratio/max": 1.908703327178955, "sampling/importance_sampling_ratio/mean": 1.000068771839142, "sampling/importance_sampling_ratio/min": 0.4005335092544556, "sampling/sampling_logp_difference/max": 0.9637721180915833, "sampling/sampling_logp_difference/mean": 0.013768712431192398, "step": 3000 }, { "epoch": 3.525264394829612, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1676.92, "eval_completions/max_terminated_length": 1676.92, "eval_completions/mean_length": 1212.78375, "eval_completions/mean_terminated_length": 1212.78375, "eval_completions/min_length": 909.88, "eval_completions/min_terminated_length": 909.88, "eval_entropy": 0.2812558990716934, "eval_frac_reward_zero_std": 0.6, "eval_loss": 0.004141798242926598, "eval_num_tokens": 406998252.0, "eval_reward": 0.7675416767597198, "eval_reward_std": 0.07886279493570328, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7675416767597198, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.297162811756134, "eval_runtime": 460.2742, "eval_samples_per_second": 0.217, "eval_sampling/importance_sampling_ratio/max": 1.9591137409210204, "eval_sampling/importance_sampling_ratio/mean": 1.0000136399269104, "eval_sampling/importance_sampling_ratio/min": 0.34440110325813295, "eval_sampling/sampling_logp_difference/max": 1.1788950943946839, "eval_sampling/sampling_logp_difference/mean": 0.013986198827624322, "eval_steps_per_second": 0.004, "step": 3000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1866.8, "completions/max_terminated_length": 1866.8, "completions/mean_length": 1259.075, "completions/mean_terminated_length": 1259.075, "completions/min_length": 962.8, "completions/min_terminated_length": 962.8, "entropy": 0.2702631801366806, "epoch": 3.5311398354876617, "frac_reward_zero_std": 0.55, "grad_norm": 0.5860521793365479, "learning_rate": 1.5156287860431306e-07, "loss": 0.0013, "num_tokens": 407717204.0, "reward": 0.8221354365348816, "reward_std": 0.10383418351411819, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8221354365348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.27810312807559967, "sampling/importance_sampling_ratio/max": 1.9322570323944093, "sampling/importance_sampling_ratio/mean": 1.000062108039856, "sampling/importance_sampling_ratio/min": 0.40129616260528567, "sampling/sampling_logp_difference/max": 1.0225771427154542, "sampling/sampling_logp_difference/mean": 0.013545482978224754, "step": 3005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1716.8, "completions/max_terminated_length": 1716.8, "completions/mean_length": 1246.384375, "completions/mean_terminated_length": 1246.384375, "completions/min_length": 847.8, "completions/min_terminated_length": 847.8, "entropy": 0.2856502890586853, "epoch": 3.537015276145711, "frac_reward_zero_std": 0.45, "grad_norm": 0.5813427567481995, "learning_rate": 1.5095711170341652e-07, "loss": 0.0047, "num_tokens": 408432751.0, "reward": 0.8304687738418579, "reward_std": 0.11719481796026229, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8304687738418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.27664896547794343, "sampling/importance_sampling_ratio/max": 1.9686694860458374, "sampling/importance_sampling_ratio/mean": 1.0000694751739503, "sampling/importance_sampling_ratio/min": 0.31002587229013445, "sampling/sampling_logp_difference/max": 1.3398897409439088, "sampling/sampling_logp_difference/mean": 0.014048396609723568, "step": 3010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1645.0, "completions/max_terminated_length": 1645.0, "completions/mean_length": 1196.290625, "completions/mean_terminated_length": 1196.290625, "completions/min_length": 942.4, "completions/min_terminated_length": 942.4, "entropy": 0.25909319818019866, "epoch": 3.54289071680376, "frac_reward_zero_std": 0.75, "grad_norm": 0.0, "learning_rate": 1.5035134480252e-07, "loss": -0.0012, "num_tokens": 409130076.0, "reward": 0.9203125238418579, "reward_std": 0.04663856625556946, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9203125238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.13417750522494315, "sampling/importance_sampling_ratio/max": 1.9461827516555785, "sampling/importance_sampling_ratio/mean": 0.9999265551567078, "sampling/importance_sampling_ratio/min": 0.4522597312927246, "sampling/sampling_logp_difference/max": 0.8987068176269531, "sampling/sampling_logp_difference/mean": 0.012898856587707997, "step": 3015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1866.8, "completions/max_terminated_length": 1866.8, "completions/mean_length": 1327.4375, "completions/mean_terminated_length": 1327.4375, "completions/min_length": 1032.6, "completions/min_terminated_length": 1032.6, "entropy": 0.2696498155593872, "epoch": 3.5487661574618095, "frac_reward_zero_std": 0.45, "grad_norm": 0.5800609588623047, "learning_rate": 1.4974557790162345e-07, "loss": 0.0039, "num_tokens": 409860904.0, "reward": 0.8716145992279053, "reward_std": 0.10067193508148194, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8716145992279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.19466981887817383, "sampling/importance_sampling_ratio/max": 1.9492676258087158, "sampling/importance_sampling_ratio/mean": 1.0000947833061218, "sampling/importance_sampling_ratio/min": 0.3709883391857147, "sampling/sampling_logp_difference/max": 1.1085660457611084, "sampling/sampling_logp_difference/mean": 0.013166314736008645, "step": 3020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1770.6, "completions/max_terminated_length": 1770.6, "completions/mean_length": 1211.534375, "completions/mean_terminated_length": 1211.534375, "completions/min_length": 828.0, "completions/min_terminated_length": 828.0, "entropy": 0.2818950593471527, "epoch": 3.554641598119859, "frac_reward_zero_std": 0.45, "grad_norm": 0.4593271315097809, "learning_rate": 1.491398110007269e-07, "loss": -0.0063, "num_tokens": 410592131.0, "reward": 0.8971354365348816, "reward_std": 0.09658310562372208, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8971354365348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.19376842826604843, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000097036361695, "sampling/importance_sampling_ratio/min": 0.3906724154949188, "sampling/sampling_logp_difference/max": 0.9618961811065674, "sampling/sampling_logp_difference/mean": 0.014005866460502147, "step": 3025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1513.6, "completions/max_terminated_length": 1513.6, "completions/mean_length": 1179.8375, "completions/mean_terminated_length": 1179.8375, "completions/min_length": 872.2, "completions/min_terminated_length": 872.2, "entropy": 0.25830590128898623, "epoch": 3.5605170387779084, "frac_reward_zero_std": 0.65, "grad_norm": 0.4542413353919983, "learning_rate": 1.485340440998304e-07, "loss": -0.0012, "num_tokens": 411270911.0, "reward": 0.8885416746139526, "reward_std": 0.07167095690965652, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8885416746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.17076411694288254, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001612901687622, "sampling/importance_sampling_ratio/min": 0.38393630981445315, "sampling/sampling_logp_difference/max": 1.1111455321311952, "sampling/sampling_logp_difference/mean": 0.013075116835534573, "step": 3030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1862.6, "completions/max_terminated_length": 1852.8, "completions/mean_length": 1280.84375, "completions/mean_terminated_length": 1263.3064208984374, "completions/min_length": 963.8, "completions/min_terminated_length": 963.8, "entropy": 0.28606864213943484, "epoch": 3.5663924794359576, "frac_reward_zero_std": 0.5, "grad_norm": 0.7895479798316956, "learning_rate": 1.4792827719893384e-07, "loss": -0.0009, "num_tokens": 411973449.0, "reward": 0.9044791698455811, "reward_std": 0.10300752446055413, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9044791698455811, "rewards/e2e_recall_precision_mixed_reward/std": 0.18286249935626983, "sampling/importance_sampling_ratio/max": 1.9817769050598144, "sampling/importance_sampling_ratio/mean": 1.0001332998275756, "sampling/importance_sampling_ratio/min": 0.44017385244369506, "sampling/sampling_logp_difference/max": 0.8357086300849914, "sampling/sampling_logp_difference/mean": 0.013966779969632626, "step": 3035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1691.8, "completions/max_terminated_length": 1691.8, "completions/mean_length": 1224.190625, "completions/mean_terminated_length": 1224.190625, "completions/min_length": 903.6, "completions/min_terminated_length": 903.6, "entropy": 0.28389262557029726, "epoch": 3.5722679200940073, "frac_reward_zero_std": 0.45, "grad_norm": 0.7468788623809814, "learning_rate": 1.473225102980373e-07, "loss": -0.0036, "num_tokens": 412704646.0, "reward": 0.768750011920929, "reward_std": 0.09476765841245652, "rewards/e2e_recall_precision_mixed_reward/mean": 0.768750011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.27755117118358613, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0, "sampling/importance_sampling_ratio/min": 0.33260057866573334, "sampling/sampling_logp_difference/max": 1.1722631096839904, "sampling/sampling_logp_difference/mean": 0.014068802073597907, "step": 3040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1792.6, "completions/max_terminated_length": 1792.6, "completions/mean_length": 1272.9375, "completions/mean_terminated_length": 1272.9375, "completions/min_length": 977.2, "completions/min_terminated_length": 977.2, "entropy": 0.282851442694664, "epoch": 3.5781433607520565, "frac_reward_zero_std": 0.65, "grad_norm": 0.0, "learning_rate": 1.4671674339714076e-07, "loss": 0.0014, "num_tokens": 413436114.0, "reward": 0.7989583492279053, "reward_std": 0.09803546294569969, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7989583492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.28233606219291685, "sampling/importance_sampling_ratio/max": 1.9849604606628417, "sampling/importance_sampling_ratio/mean": 0.9999540328979493, "sampling/importance_sampling_ratio/min": 0.30122940987348557, "sampling/sampling_logp_difference/max": 1.4530311226844788, "sampling/sampling_logp_difference/mean": 0.01404977347701788, "step": 3045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1722.4, "completions/max_terminated_length": 1722.4, "completions/mean_length": 1213.70625, "completions/mean_terminated_length": 1213.70625, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "entropy": 0.28000237941741946, "epoch": 3.5840188014101058, "frac_reward_zero_std": 0.55, "grad_norm": 0.7278285622596741, "learning_rate": 1.4611097649624423e-07, "loss": -0.002, "num_tokens": 414138340.0, "reward": 0.8671875, "reward_std": 0.0896983802318573, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8671875119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.24096697121858596, "sampling/importance_sampling_ratio/max": 1.9819402933120727, "sampling/importance_sampling_ratio/mean": 0.9999994993209839, "sampling/importance_sampling_ratio/min": 0.3462633116170764, "sampling/sampling_logp_difference/max": 1.7581159114837646, "sampling/sampling_logp_difference/mean": 0.013958721049129963, "step": 3050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375, "completions/max_length": 1755.4, "completions/max_terminated_length": 1719.8, "completions/mean_length": 1299.853125, "completions/mean_terminated_length": 1289.995361328125, "completions/min_length": 955.6, "completions/min_terminated_length": 955.6, "entropy": 0.2778258055448532, "epoch": 3.589894242068155, "frac_reward_zero_std": 0.55, "grad_norm": 0.42271292209625244, "learning_rate": 1.4550520959534772e-07, "loss": -0.0104, "num_tokens": 414857417.0, "reward": 0.7609375238418579, "reward_std": 0.07389688491821289, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7609375238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.29788713455200194, "sampling/importance_sampling_ratio/max": 1.9975370168685913, "sampling/importance_sampling_ratio/mean": 1.0000292778015136, "sampling/importance_sampling_ratio/min": 0.32156047150492667, "sampling/sampling_logp_difference/max": 1.5476860523223877, "sampling/sampling_logp_difference/mean": 0.013813853822648525, "step": 3055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1609.8, "completions/max_terminated_length": 1609.8, "completions/mean_length": 1203.35625, "completions/mean_terminated_length": 1203.35625, "completions/min_length": 942.4, "completions/min_terminated_length": 942.4, "entropy": 0.27206062972545625, "epoch": 3.5957696827262042, "frac_reward_zero_std": 0.5, "grad_norm": 0.7698452472686768, "learning_rate": 1.4489944269445118e-07, "loss": 0.0086, "num_tokens": 415559611.0, "reward": 0.8005208611488343, "reward_std": 0.09430278986692428, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8005208611488343, "rewards/e2e_recall_precision_mixed_reward/std": 0.28068689703941346, "sampling/importance_sampling_ratio/max": 1.9747754335403442, "sampling/importance_sampling_ratio/mean": 0.999997878074646, "sampling/importance_sampling_ratio/min": 0.3823790907859802, "sampling/sampling_logp_difference/max": 1.0575198411941529, "sampling/sampling_logp_difference/mean": 0.01356248427182436, "step": 3060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1615.8, "completions/max_terminated_length": 1615.8, "completions/mean_length": 1229.265625, "completions/mean_terminated_length": 1229.265625, "completions/min_length": 946.2, "completions/min_terminated_length": 946.2, "entropy": 0.28500559329986574, "epoch": 3.601645123384254, "frac_reward_zero_std": 0.4, "grad_norm": 0.7499962449073792, "learning_rate": 1.4429367579355464e-07, "loss": 0.0021, "num_tokens": 416258560.0, "reward": 0.8640625238418579, "reward_std": 0.12375695258378983, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8640625238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.24018340706825256, "sampling/importance_sampling_ratio/max": 1.937759232521057, "sampling/importance_sampling_ratio/mean": 0.9999950051307678, "sampling/importance_sampling_ratio/min": 0.37119474411010744, "sampling/sampling_logp_difference/max": 1.0601333975791931, "sampling/sampling_logp_difference/mean": 0.014097846299409866, "step": 3065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1569.8, "completions/max_terminated_length": 1569.8, "completions/mean_length": 1160.625, "completions/mean_terminated_length": 1160.625, "completions/min_length": 896.8, "completions/min_terminated_length": 896.8, "entropy": 0.25845094621181486, "epoch": 3.607520564042303, "frac_reward_zero_std": 0.5, "grad_norm": 0.5670578479766846, "learning_rate": 1.436879088926581e-07, "loss": -0.0026, "num_tokens": 416934696.0, "reward": 0.8625000238418579, "reward_std": 0.10116852670907975, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8625000238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.24681516885757446, "sampling/importance_sampling_ratio/max": 1.9113659143447876, "sampling/importance_sampling_ratio/mean": 1.0000329732894897, "sampling/importance_sampling_ratio/min": 0.39745662212371824, "sampling/sampling_logp_difference/max": 0.9369120836257935, "sampling/sampling_logp_difference/mean": 0.012975651957094669, "step": 3070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1579.6, "completions/max_terminated_length": 1579.6, "completions/mean_length": 1190.0125, "completions/mean_terminated_length": 1190.0125, "completions/min_length": 852.0, "completions/min_terminated_length": 852.0, "entropy": 0.28933039903640745, "epoch": 3.6133960047003524, "frac_reward_zero_std": 0.65, "grad_norm": 0.7078462243080139, "learning_rate": 1.4308214199176154e-07, "loss": 0.0015, "num_tokens": 417633500.0, "reward": 0.9078125119209289, "reward_std": 0.07728993520140648, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9078125119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.1712805688381195, "sampling/importance_sampling_ratio/max": 1.9024327516555786, "sampling/importance_sampling_ratio/mean": 0.9999622344970703, "sampling/importance_sampling_ratio/min": 0.3870242595672607, "sampling/sampling_logp_difference/max": 0.9816065073013306, "sampling/sampling_logp_difference/mean": 0.01414424292743206, "step": 3075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 1763.8, "completions/max_terminated_length": 1761.4, "completions/mean_length": 1259.35625, "completions/mean_terminated_length": 1245.55146484375, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "entropy": 0.2990762531757355, "epoch": 3.619271445358402, "frac_reward_zero_std": 0.5, "grad_norm": 0.47593507170677185, "learning_rate": 1.4247637509086503e-07, "loss": -0.0174, "num_tokens": 418346446.0, "reward": 0.9101562738418579, "reward_std": 0.11092746555805207, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9101562738418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.20071874484419822, "sampling/importance_sampling_ratio/max": 1.9293018341064454, "sampling/importance_sampling_ratio/mean": 0.9999398589134216, "sampling/importance_sampling_ratio/min": 0.31697064116597173, "sampling/sampling_logp_difference/max": 1.6223263263702392, "sampling/sampling_logp_difference/mean": 0.01478542685508728, "step": 3080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1825.0, "completions/max_terminated_length": 1674.8, "completions/mean_length": 1224.38125, "completions/mean_terminated_length": 1220.5604736328125, "completions/min_length": 927.4, "completions/min_terminated_length": 927.4, "entropy": 0.2816446602344513, "epoch": 3.6251468860164513, "frac_reward_zero_std": 0.55, "grad_norm": 0.45582300424575806, "learning_rate": 1.418706081899685e-07, "loss": -0.0189, "num_tokens": 419068628.0, "reward": 0.879687511920929, "reward_std": 0.07607024312019348, "rewards/e2e_recall_precision_mixed_reward/mean": 0.879687511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.1917457178235054, "sampling/importance_sampling_ratio/max": 1.9380622863769532, "sampling/importance_sampling_ratio/mean": 0.9999940156936645, "sampling/importance_sampling_ratio/min": 0.38542511463165285, "sampling/sampling_logp_difference/max": 0.9764352560043335, "sampling/sampling_logp_difference/mean": 0.013983016833662987, "step": 3085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 1329.95625, "completions/mean_terminated_length": 1329.95625, "completions/min_length": 977.6, "completions/min_terminated_length": 977.6, "entropy": 0.26992570161819457, "epoch": 3.6310223266745005, "frac_reward_zero_std": 0.55, "grad_norm": 0.4041450619697571, "learning_rate": 1.4126484128907196e-07, "loss": -0.0004, "num_tokens": 419780486.0, "reward": 0.7979166746139527, "reward_std": 0.09113014414906502, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7979166746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.3225852161645889, "sampling/importance_sampling_ratio/max": 1.9687642812728883, "sampling/importance_sampling_ratio/mean": 1.0000601291656495, "sampling/importance_sampling_ratio/min": 0.3267024874687195, "sampling/sampling_logp_difference/max": 1.3370716214179992, "sampling/sampling_logp_difference/mean": 0.013374082185328007, "step": 3090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1923.0, "completions/max_terminated_length": 1923.0, "completions/mean_length": 1279.303125, "completions/mean_terminated_length": 1279.303125, "completions/min_length": 959.6, "completions/min_terminated_length": 959.6, "entropy": 0.29384331703186034, "epoch": 3.6368977673325498, "frac_reward_zero_std": 0.6, "grad_norm": 0.37273073196411133, "learning_rate": 1.4065907438817542e-07, "loss": 0.0016, "num_tokens": 420497159.0, "reward": 0.8317708492279052, "reward_std": 0.07642179653048516, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8317708492279052, "rewards/e2e_recall_precision_mixed_reward/std": 0.14986341148614885, "sampling/importance_sampling_ratio/max": 1.9512465000152588, "sampling/importance_sampling_ratio/mean": 1.0000032186508179, "sampling/importance_sampling_ratio/min": 0.3234915256500244, "sampling/sampling_logp_difference/max": 1.1340311884880065, "sampling/sampling_logp_difference/mean": 0.014189736545085907, "step": 3095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1810.6, "completions/max_terminated_length": 1810.6, "completions/mean_length": 1364.653125, "completions/mean_terminated_length": 1364.653125, "completions/min_length": 987.0, "completions/min_terminated_length": 987.0, "entropy": 0.2985062301158905, "epoch": 3.6427732079905994, "frac_reward_zero_std": 0.55, "grad_norm": 0.40833938121795654, "learning_rate": 1.4005330748727888e-07, "loss": 0.0033, "num_tokens": 421275000.0, "reward": 0.8166146039962768, "reward_std": 0.07883021160960198, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8166146039962768, "rewards/e2e_recall_precision_mixed_reward/std": 0.2833401970565319, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000027179718018, "sampling/importance_sampling_ratio/min": 0.19541268646717072, "sampling/sampling_logp_difference/max": 1.775990653038025, "sampling/sampling_logp_difference/mean": 0.014779189042747021, "step": 3100 }, { "epoch": 3.6427732079905994, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1717.16, "eval_completions/max_terminated_length": 1717.16, "eval_completions/mean_length": 1257.2675, "eval_completions/mean_terminated_length": 1257.2675, "eval_completions/min_length": 936.28, "eval_completions/min_terminated_length": 936.28, "eval_entropy": 0.29172040104866026, "eval_frac_reward_zero_std": 0.58, "eval_loss": -0.0004253547522239387, "eval_num_tokens": 421275000.0, "eval_reward": 0.7633229267597198, "eval_reward_std": 0.0836981363594532, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7633229267597198, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2984502202272415, "eval_runtime": 473.8791, "eval_samples_per_second": 0.211, "eval_sampling/importance_sampling_ratio/max": 1.9595999717712402, "eval_sampling/importance_sampling_ratio/mean": 0.999986469745636, "eval_sampling/importance_sampling_ratio/min": 0.3356508800573647, "eval_sampling/sampling_logp_difference/max": 1.4490785884857178, "eval_sampling/sampling_logp_difference/mean": 0.01437144923955202, "eval_steps_per_second": 0.004, "step": 3100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1738.0, "completions/max_terminated_length": 1738.0, "completions/mean_length": 1287.340625, "completions/mean_terminated_length": 1287.340625, "completions/min_length": 956.6, "completions/min_terminated_length": 956.6, "entropy": 0.30881457328796386, "epoch": 3.6486486486486487, "frac_reward_zero_std": 0.45, "grad_norm": 0.34845200181007385, "learning_rate": 1.3944754058638237e-07, "loss": 0.0014, "num_tokens": 421994277.0, "reward": 0.7625, "reward_std": 0.10787137746810913, "rewards/e2e_recall_precision_mixed_reward/mean": 0.762500011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.2934652417898178, "sampling/importance_sampling_ratio/max": 1.8299882888793946, "sampling/importance_sampling_ratio/mean": 0.9999529242515564, "sampling/importance_sampling_ratio/min": 0.32334981858730316, "sampling/sampling_logp_difference/max": 1.260542106628418, "sampling/sampling_logp_difference/mean": 0.015047940611839294, "step": 3105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1736.6, "completions/max_terminated_length": 1736.6, "completions/mean_length": 1294.334375, "completions/mean_terminated_length": 1294.334375, "completions/min_length": 1006.8, "completions/min_terminated_length": 1006.8, "entropy": 0.295719313621521, "epoch": 3.654524089306698, "frac_reward_zero_std": 0.5, "grad_norm": 0.37620779871940613, "learning_rate": 1.3884177368548583e-07, "loss": 0.0024, "num_tokens": 422722320.0, "reward": 0.8888020873069763, "reward_std": 0.08835629969835282, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8888020873069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.17617078721523285, "sampling/importance_sampling_ratio/max": 1.8987627744674682, "sampling/importance_sampling_ratio/mean": 0.9999510288238526, "sampling/importance_sampling_ratio/min": 0.39632954001426696, "sampling/sampling_logp_difference/max": 1.019751000404358, "sampling/sampling_logp_difference/mean": 0.014330669678747654, "step": 3110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1861.6, "completions/max_terminated_length": 1861.6, "completions/mean_length": 1327.196875, "completions/mean_terminated_length": 1327.196875, "completions/min_length": 927.8, "completions/min_terminated_length": 927.8, "entropy": 0.30390411615371704, "epoch": 3.6603995299647476, "frac_reward_zero_std": 0.65, "grad_norm": 0.3971744179725647, "learning_rate": 1.3823600678458927e-07, "loss": -0.0021, "num_tokens": 423456351.0, "reward": 0.8244791805744172, "reward_std": 0.04826573207974434, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8244791865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.155859262496233, "sampling/importance_sampling_ratio/max": 1.949301791191101, "sampling/importance_sampling_ratio/mean": 0.9999527335166931, "sampling/importance_sampling_ratio/min": 0.40473890900611875, "sampling/sampling_logp_difference/max": 1.2450590133666992, "sampling/sampling_logp_difference/mean": 0.014594112709164619, "step": 3115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1975.2, "completions/max_terminated_length": 1975.2, "completions/mean_length": 1400.11875, "completions/mean_terminated_length": 1400.11875, "completions/min_length": 996.2, "completions/min_terminated_length": 996.2, "entropy": 0.29585008025169374, "epoch": 3.666274970622797, "frac_reward_zero_std": 0.55, "grad_norm": 0.6170745491981506, "learning_rate": 1.3763023988369273e-07, "loss": 0.0028, "num_tokens": 424206165.0, "reward": 0.8633333444595337, "reward_std": 0.08815523274242879, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8633333444595337, "rewards/e2e_recall_precision_mixed_reward/std": 0.24421373009681702, "sampling/importance_sampling_ratio/max": 1.9507896423339843, "sampling/importance_sampling_ratio/mean": 1.0000440001487731, "sampling/importance_sampling_ratio/min": 0.3013356953859329, "sampling/sampling_logp_difference/max": 1.284391450881958, "sampling/sampling_logp_difference/mean": 0.01431298851966858, "step": 3120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1929.2, "completions/max_terminated_length": 1880.8, "completions/mean_length": 1353.30625, "completions/mean_terminated_length": 1350.4684326171875, "completions/min_length": 1014.6, "completions/min_terminated_length": 1014.6, "entropy": 0.29437545537948606, "epoch": 3.672150411280846, "frac_reward_zero_std": 0.65, "grad_norm": 0.7374795079231262, "learning_rate": 1.370244729827962e-07, "loss": 0.0048, "num_tokens": 424945763.0, "reward": 0.8335937619209289, "reward_std": 0.08211355954408646, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8335937619209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.2667385458946228, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000265955924987, "sampling/importance_sampling_ratio/min": 0.2846124005503953, "sampling/sampling_logp_difference/max": 2.374808597564697, "sampling/sampling_logp_difference/mean": 0.014504742994904517, "step": 3125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1820.4, "completions/max_terminated_length": 1820.4, "completions/mean_length": 1330.065625, "completions/mean_terminated_length": 1330.065625, "completions/min_length": 988.8, "completions/min_terminated_length": 988.8, "entropy": 0.2882347762584686, "epoch": 3.6780258519388953, "frac_reward_zero_std": 0.55, "grad_norm": 0.58107990026474, "learning_rate": 1.3641870608189969e-07, "loss": -0.0016, "num_tokens": 425713832.0, "reward": 0.8757812738418579, "reward_std": 0.08430513888597488, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8757812738418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.21622689962387084, "sampling/importance_sampling_ratio/max": 1.990927243232727, "sampling/importance_sampling_ratio/mean": 0.9999788165092468, "sampling/importance_sampling_ratio/min": 0.3501891404390335, "sampling/sampling_logp_difference/max": 1.1427728176116942, "sampling/sampling_logp_difference/mean": 0.013877778686583042, "step": 3130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1819.2, "completions/max_terminated_length": 1819.2, "completions/mean_length": 1303.94375, "completions/mean_terminated_length": 1303.94375, "completions/min_length": 986.8, "completions/min_terminated_length": 986.8, "entropy": 0.2957104444503784, "epoch": 3.6839012925969445, "frac_reward_zero_std": 0.65, "grad_norm": 0.0, "learning_rate": 1.3581293918100315e-07, "loss": 0.0038, "num_tokens": 426498822.0, "reward": 0.760937511920929, "reward_std": 0.06861766874790191, "rewards/e2e_recall_precision_mixed_reward/mean": 0.760937511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.401876425743103, "sampling/importance_sampling_ratio/max": 1.9452528476715087, "sampling/importance_sampling_ratio/mean": 0.9999453067779541, "sampling/importance_sampling_ratio/min": 0.2957945063710213, "sampling/sampling_logp_difference/max": 1.479243540763855, "sampling/sampling_logp_difference/mean": 0.014747031778097153, "step": 3135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1718.2, "completions/max_terminated_length": 1718.2, "completions/mean_length": 1282.125, "completions/mean_terminated_length": 1282.125, "completions/min_length": 988.6, "completions/min_terminated_length": 988.6, "entropy": 0.28800985813140867, "epoch": 3.6897767332549942, "frac_reward_zero_std": 0.6, "grad_norm": 0.5976863503456116, "learning_rate": 1.352071722801066e-07, "loss": -0.002, "num_tokens": 427214238.0, "reward": 0.8390625238418579, "reward_std": 0.06042231023311615, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8390625238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.28592842221260073, "sampling/importance_sampling_ratio/max": 1.9544946670532226, "sampling/importance_sampling_ratio/mean": 0.9999897599220275, "sampling/importance_sampling_ratio/min": 0.2928850159049034, "sampling/sampling_logp_difference/max": 1.3969106674194336, "sampling/sampling_logp_difference/mean": 0.014067772217094899, "step": 3140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1836.0, "completions/max_terminated_length": 1836.0, "completions/mean_length": 1300.1125, "completions/mean_terminated_length": 1300.1125, "completions/min_length": 907.2, "completions/min_terminated_length": 907.2, "entropy": 0.28729010820388795, "epoch": 3.6956521739130435, "frac_reward_zero_std": 0.4, "grad_norm": 0.8007182478904724, "learning_rate": 1.3460140537921007e-07, "loss": -0.0018, "num_tokens": 427997730.0, "reward": 0.8020833492279053, "reward_std": 0.13551612198352814, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8020833492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.2759165666997433, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000725269317627, "sampling/importance_sampling_ratio/min": 0.20970812886953355, "sampling/sampling_logp_difference/max": 1.6685986518859863, "sampling/sampling_logp_difference/mean": 0.014266648329794407, "step": 3145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1675.2, "completions/max_terminated_length": 1675.2, "completions/mean_length": 1290.090625, "completions/mean_terminated_length": 1290.090625, "completions/min_length": 965.8, "completions/min_terminated_length": 965.8, "entropy": 0.28130186200141905, "epoch": 3.7015276145710927, "frac_reward_zero_std": 0.65, "grad_norm": 0.35631677508354187, "learning_rate": 1.3399563847831354e-07, "loss": -0.0013, "num_tokens": 428694863.0, "reward": 0.8708854198455811, "reward_std": 0.05269531458616257, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8708854198455811, "rewards/e2e_recall_precision_mixed_reward/std": 0.22914678156375884, "sampling/importance_sampling_ratio/max": 1.9211535692214965, "sampling/importance_sampling_ratio/mean": 0.9999082922935486, "sampling/importance_sampling_ratio/min": 0.37740443348884584, "sampling/sampling_logp_difference/max": 1.008909249305725, "sampling/sampling_logp_difference/mean": 0.013886995241045951, "step": 3150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1931.8, "completions/max_terminated_length": 1931.8, "completions/mean_length": 1323.0625, "completions/mean_terminated_length": 1323.0625, "completions/min_length": 911.6, "completions/min_terminated_length": 911.6, "entropy": 0.3171755850315094, "epoch": 3.7074030552291424, "frac_reward_zero_std": 0.65, "grad_norm": 0.7349398136138916, "learning_rate": 1.33389871577417e-07, "loss": -0.0039, "num_tokens": 429435427.0, "reward": 0.8898437738418579, "reward_std": 0.04950306043028831, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8898437738418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.176000939309597, "sampling/importance_sampling_ratio/max": 1.988955020904541, "sampling/importance_sampling_ratio/mean": 0.9999869346618653, "sampling/importance_sampling_ratio/min": 0.4141034007072449, "sampling/sampling_logp_difference/max": 0.8982202529907226, "sampling/sampling_logp_difference/mean": 0.015075892955064774, "step": 3155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1743.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 1276.153125, "completions/mean_terminated_length": 1276.153125, "completions/min_length": 955.2, "completions/min_terminated_length": 955.2, "entropy": 0.2771688997745514, "epoch": 3.7132784958871916, "frac_reward_zero_std": 0.6, "grad_norm": 0.5998103618621826, "learning_rate": 1.3278410467652046e-07, "loss": -0.0041, "num_tokens": 430172244.0, "reward": 0.9096354246139526, "reward_std": 0.0633207380771637, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9096354246139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.16724707335233688, "sampling/importance_sampling_ratio/max": 1.961449098587036, "sampling/importance_sampling_ratio/mean": 0.9999847888946534, "sampling/importance_sampling_ratio/min": 0.3520124971866608, "sampling/sampling_logp_difference/max": 1.2267968893051147, "sampling/sampling_logp_difference/mean": 0.013828900456428529, "step": 3160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1734.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 1276.95, "completions/mean_terminated_length": 1276.95, "completions/min_length": 936.2, "completions/min_terminated_length": 936.2, "entropy": 0.2937505543231964, "epoch": 3.719153936545241, "frac_reward_zero_std": 0.4, "grad_norm": 0.5660243034362793, "learning_rate": 1.3217833777562393e-07, "loss": -0.0023, "num_tokens": 430906164.0, "reward": 0.8408854246139527, "reward_std": 0.0947806142270565, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8408854246139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.29198580980300903, "sampling/importance_sampling_ratio/max": 1.9845717906951905, "sampling/importance_sampling_ratio/mean": 1.0000143647193909, "sampling/importance_sampling_ratio/min": 0.36637015342712403, "sampling/sampling_logp_difference/max": 1.0281299352645874, "sampling/sampling_logp_difference/mean": 0.014127112366259098, "step": 3165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1809.8, "completions/max_terminated_length": 1809.8, "completions/mean_length": 1347.85625, "completions/mean_terminated_length": 1347.85625, "completions/min_length": 1005.0, "completions/min_terminated_length": 1005.0, "entropy": 0.2895920634269714, "epoch": 3.72502937720329, "frac_reward_zero_std": 0.55, "grad_norm": 0.6063299775123596, "learning_rate": 1.315725708747274e-07, "loss": 0.0046, "num_tokens": 431671334.0, "reward": 0.8238020896911621, "reward_std": 0.06915819272398949, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8238020896911621, "rewards/e2e_recall_precision_mixed_reward/std": 0.21795693933963775, "sampling/importance_sampling_ratio/max": 1.9383326530456544, "sampling/importance_sampling_ratio/mean": 0.9999857902526855, "sampling/importance_sampling_ratio/min": 0.31745859086513517, "sampling/sampling_logp_difference/max": 1.318444514274597, "sampling/sampling_logp_difference/mean": 0.014070061966776848, "step": 3170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1714.2, "completions/max_terminated_length": 1714.2, "completions/mean_length": 1256.196875, "completions/mean_terminated_length": 1256.196875, "completions/min_length": 964.8, "completions/min_terminated_length": 964.8, "entropy": 0.2780440628528595, "epoch": 3.7309048178613398, "frac_reward_zero_std": 0.65, "grad_norm": 0.4056414067745209, "learning_rate": 1.3096680397383085e-07, "loss": -0.0001, "num_tokens": 432385749.0, "reward": 0.903125, "reward_std": 0.06527099013328552, "rewards/e2e_recall_precision_mixed_reward/mean": 0.903125, "rewards/e2e_recall_precision_mixed_reward/std": 0.1702724814414978, "sampling/importance_sampling_ratio/max": 1.9694818019866944, "sampling/importance_sampling_ratio/mean": 0.9998815774917602, "sampling/importance_sampling_ratio/min": 0.30392550230026244, "sampling/sampling_logp_difference/max": 1.3078163862228394, "sampling/sampling_logp_difference/mean": 0.013784093409776687, "step": 3175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1895.6, "completions/max_terminated_length": 1895.6, "completions/mean_length": 1292.0875, "completions/mean_terminated_length": 1292.0875, "completions/min_length": 922.4, "completions/min_terminated_length": 922.4, "entropy": 0.2698298662900925, "epoch": 3.736780258519389, "frac_reward_zero_std": 0.55, "grad_norm": 0.5389244556427002, "learning_rate": 1.3036103707293434e-07, "loss": -0.0008, "num_tokens": 433123985.0, "reward": 0.8802083611488343, "reward_std": 0.09341669231653213, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8802083611488343, "rewards/e2e_recall_precision_mixed_reward/std": 0.23054814338684082, "sampling/importance_sampling_ratio/max": 1.9669621229171752, "sampling/importance_sampling_ratio/mean": 1.0000298500061036, "sampling/importance_sampling_ratio/min": 0.3251545369625092, "sampling/sampling_logp_difference/max": 1.1636561393737792, "sampling/sampling_logp_difference/mean": 0.0135076355189085, "step": 3180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1696.8, "completions/max_terminated_length": 1696.8, "completions/mean_length": 1231.425, "completions/mean_terminated_length": 1231.425, "completions/min_length": 858.2, "completions/min_terminated_length": 858.2, "entropy": 0.28715863823890686, "epoch": 3.7426556991774382, "frac_reward_zero_std": 0.75, "grad_norm": 0.0, "learning_rate": 1.297552701720378e-07, "loss": -0.0024, "num_tokens": 433831369.0, "reward": 0.8338541746139526, "reward_std": 0.05719553902745247, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8338541746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.17622071206569673, "sampling/importance_sampling_ratio/max": 1.9232003927230834, "sampling/importance_sampling_ratio/mean": 0.9998136281967163, "sampling/importance_sampling_ratio/min": 0.38944405019283296, "sampling/sampling_logp_difference/max": 1.0467731714248658, "sampling/sampling_logp_difference/mean": 0.014088386856019496, "step": 3185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1944.8, "completions/max_terminated_length": 1944.8, "completions/mean_length": 1343.6875, "completions/mean_terminated_length": 1343.6875, "completions/min_length": 998.2, "completions/min_terminated_length": 998.2, "entropy": 0.28584455847740176, "epoch": 3.748531139835488, "frac_reward_zero_std": 0.65, "grad_norm": 0.7371358871459961, "learning_rate": 1.2914950327114127e-07, "loss": 0.0072, "num_tokens": 434593301.0, "reward": 0.7756250143051148, "reward_std": 0.0602168183773756, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7756250143051148, "rewards/e2e_recall_precision_mixed_reward/std": 0.29703550338745116, "sampling/importance_sampling_ratio/max": 1.9905659914016725, "sampling/importance_sampling_ratio/mean": 1.0000561237335206, "sampling/importance_sampling_ratio/min": 0.382511293888092, "sampling/sampling_logp_difference/max": 1.0009687900543214, "sampling/sampling_logp_difference/mean": 0.01421151626855135, "step": 3190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1796.8, "completions/max_terminated_length": 1796.8, "completions/mean_length": 1256.571875, "completions/mean_terminated_length": 1256.571875, "completions/min_length": 887.8, "completions/min_terminated_length": 887.8, "entropy": 0.2753683507442474, "epoch": 3.754406580493537, "frac_reward_zero_std": 0.65, "grad_norm": 0.7251403331756592, "learning_rate": 1.2854373637024473e-07, "loss": -0.0012, "num_tokens": 435349260.0, "reward": 0.9098958492279052, "reward_std": 0.07237763702869415, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9098958492279052, "rewards/e2e_recall_precision_mixed_reward/std": 0.19696723371744157, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999914765357971, "sampling/importance_sampling_ratio/min": 0.30798264741897585, "sampling/sampling_logp_difference/max": 1.2155251502990723, "sampling/sampling_logp_difference/mean": 0.013821718096733094, "step": 3195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1932.0, "completions/max_terminated_length": 1932.0, "completions/mean_length": 1318.975, "completions/mean_terminated_length": 1318.975, "completions/min_length": 970.8, "completions/min_terminated_length": 970.8, "entropy": 0.2900474309921265, "epoch": 3.7602820211515864, "frac_reward_zero_std": 0.45, "grad_norm": 0.6051058769226074, "learning_rate": 1.2793796946934817e-07, "loss": 0.009, "num_tokens": 436108212.0, "reward": 0.7828125, "reward_std": 0.10932088047266006, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7828125, "rewards/e2e_recall_precision_mixed_reward/std": 0.28247700333595277, "sampling/importance_sampling_ratio/max": 1.9975552320480348, "sampling/importance_sampling_ratio/mean": 0.9999950647354126, "sampling/importance_sampling_ratio/min": 0.34100759625434873, "sampling/sampling_logp_difference/max": 1.166985023021698, "sampling/sampling_logp_difference/mean": 0.014263258688151837, "step": 3200 }, { "epoch": 3.7602820211515864, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1706.72, "eval_completions/max_terminated_length": 1706.72, "eval_completions/mean_length": 1246.888125, "eval_completions/mean_terminated_length": 1246.888125, "eval_completions/min_length": 955.2, "eval_completions/min_terminated_length": 955.2, "eval_entropy": 0.2836748969554901, "eval_frac_reward_zero_std": 0.58, "eval_loss": 0.0020352269057184458, "eval_num_tokens": 436108212.0, "eval_reward": 0.7717708504199982, "eval_reward_std": 0.08081570498645306, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7717708504199982, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.29261682510375975, "eval_runtime": 476.9187, "eval_samples_per_second": 0.21, "eval_sampling/importance_sampling_ratio/max": 1.9656561374664308, "eval_sampling/importance_sampling_ratio/mean": 0.9999639821052552, "eval_sampling/importance_sampling_ratio/min": 0.30387570122024044, "eval_sampling/sampling_logp_difference/max": 1.6534623003005982, "eval_sampling/sampling_logp_difference/mean": 0.01408041562885046, "eval_steps_per_second": 0.004, "step": 3200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1647.4, "completions/max_terminated_length": 1647.4, "completions/mean_length": 1226.03125, "completions/mean_terminated_length": 1226.03125, "completions/min_length": 913.4, "completions/min_terminated_length": 913.4, "entropy": 0.2630267202854156, "epoch": 3.7661574618096356, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 1.2733220256845166e-07, "loss": 0.0038, "num_tokens": 436822078.0, "reward": 0.9191145896911621, "reward_std": 0.024364107847213747, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9191145896911621, "rewards/e2e_recall_precision_mixed_reward/std": 0.1650282010436058, "sampling/importance_sampling_ratio/max": 1.9180307149887086, "sampling/importance_sampling_ratio/mean": 0.9998865008354187, "sampling/importance_sampling_ratio/min": 0.3275682792067528, "sampling/sampling_logp_difference/max": 1.269556188583374, "sampling/sampling_logp_difference/mean": 0.013227949663996697, "step": 3205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1672.2, "completions/max_terminated_length": 1672.2, "completions/mean_length": 1261.959375, "completions/mean_terminated_length": 1261.959375, "completions/min_length": 888.4, "completions/min_terminated_length": 888.4, "entropy": 0.2816715180873871, "epoch": 3.772032902467685, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 1.2672643566755512e-07, "loss": -0.0017, "num_tokens": 437541409.0, "reward": 0.8677083492279053, "reward_std": 0.031082433462142945, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8677083492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.24662085697054864, "sampling/importance_sampling_ratio/max": 1.9304154872894288, "sampling/importance_sampling_ratio/mean": 0.9999831080436706, "sampling/importance_sampling_ratio/min": 0.4297758400440216, "sampling/sampling_logp_difference/max": 0.8534170627593994, "sampling/sampling_logp_difference/mean": 0.014014366827905179, "step": 3210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1905.6, "completions/max_terminated_length": 1905.6, "completions/mean_length": 1288.709375, "completions/mean_terminated_length": 1288.709375, "completions/min_length": 924.4, "completions/min_terminated_length": 924.4, "entropy": 0.29503530263900757, "epoch": 3.7779083431257345, "frac_reward_zero_std": 0.7, "grad_norm": 0.3730573058128357, "learning_rate": 1.2612066876665858e-07, "loss": -0.0028, "num_tokens": 438272292.0, "reward": 0.94453125, "reward_std": 0.06546878516674041, "rewards/e2e_recall_precision_mixed_reward/mean": 0.94453125, "rewards/e2e_recall_precision_mixed_reward/std": 0.1374159798026085, "sampling/importance_sampling_ratio/max": 1.9494167804718017, "sampling/importance_sampling_ratio/mean": 1.0000098466873169, "sampling/importance_sampling_ratio/min": 0.3806808590888977, "sampling/sampling_logp_difference/max": 1.0228654861450195, "sampling/sampling_logp_difference/mean": 0.014348461478948592, "step": 3215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1635.2, "completions/max_terminated_length": 1635.2, "completions/mean_length": 1204.7875, "completions/mean_terminated_length": 1204.7875, "completions/min_length": 843.6, "completions/min_terminated_length": 843.6, "entropy": 0.2663033068180084, "epoch": 3.7837837837837838, "frac_reward_zero_std": 0.75, "grad_norm": 0.0, "learning_rate": 1.2551490186576204e-07, "loss": 0.0036, "num_tokens": 438955920.0, "reward": 0.8723958492279053, "reward_std": 0.06076589897274971, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8723958492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.2042554274201393, "sampling/importance_sampling_ratio/max": 1.8793493509292603, "sampling/importance_sampling_ratio/mean": 1.0000537037849426, "sampling/importance_sampling_ratio/min": 0.37652627825737, "sampling/sampling_logp_difference/max": 1.1007208824157715, "sampling/sampling_logp_difference/mean": 0.013395345583558083, "step": 3220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1841.0, "completions/max_terminated_length": 1841.0, "completions/mean_length": 1314.41875, "completions/mean_terminated_length": 1314.41875, "completions/min_length": 998.6, "completions/min_terminated_length": 998.6, "entropy": 0.29712930917739866, "epoch": 3.789659224441833, "frac_reward_zero_std": 0.55, "grad_norm": 0.5394260287284851, "learning_rate": 1.249091349648655e-07, "loss": 0.0071, "num_tokens": 439713382.0, "reward": 0.8750000119209289, "reward_std": 0.08217244297266006, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8750000119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.21217068284749985, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999483704566956, "sampling/importance_sampling_ratio/min": 0.3432030320167542, "sampling/sampling_logp_difference/max": 1.1881095886230468, "sampling/sampling_logp_difference/mean": 0.014405792579054832, "step": 3225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1802.2, "completions/max_terminated_length": 1802.2, "completions/mean_length": 1308.49375, "completions/mean_terminated_length": 1308.49375, "completions/min_length": 957.8, "completions/min_terminated_length": 957.8, "entropy": 0.2827461302280426, "epoch": 3.7955346650998827, "frac_reward_zero_std": 0.5, "grad_norm": 0.6620894074440002, "learning_rate": 1.2430336806396897e-07, "loss": -0.0002, "num_tokens": 440436756.0, "reward": 0.7843750238418579, "reward_std": 0.07676660120487214, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7843750238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.28354223668575285, "sampling/importance_sampling_ratio/max": 1.9482336044311523, "sampling/importance_sampling_ratio/mean": 1.0000541090965271, "sampling/importance_sampling_ratio/min": 0.23740711510181428, "sampling/sampling_logp_difference/max": 1.610867190361023, "sampling/sampling_logp_difference/mean": 0.013959074392914772, "step": 3230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1830.4, "completions/max_terminated_length": 1830.4, "completions/mean_length": 1286.35, "completions/mean_terminated_length": 1286.35, "completions/min_length": 939.2, "completions/min_terminated_length": 939.2, "entropy": 0.2779347479343414, "epoch": 3.801410105757932, "frac_reward_zero_std": 0.45, "grad_norm": 0.7190999388694763, "learning_rate": 1.2369760116307243e-07, "loss": 0.0003, "num_tokens": 441181252.0, "reward": 0.8755208373069763, "reward_std": 0.10408329591155052, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8755208373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.21340711712837218, "sampling/importance_sampling_ratio/max": 1.9635958909988402, "sampling/importance_sampling_ratio/mean": 1.0000646114349365, "sampling/importance_sampling_ratio/min": 0.26795525550842286, "sampling/sampling_logp_difference/max": 1.4363911390304565, "sampling/sampling_logp_difference/mean": 0.014074122533202171, "step": 3235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1587.4, "completions/max_terminated_length": 1587.4, "completions/mean_length": 1182.74375, "completions/mean_terminated_length": 1182.74375, "completions/min_length": 876.8, "completions/min_terminated_length": 876.8, "entropy": 0.27898582220077517, "epoch": 3.807285546415981, "frac_reward_zero_std": 0.6, "grad_norm": 0.4463542401790619, "learning_rate": 1.2309183426217592e-07, "loss": 0.0065, "num_tokens": 441885602.0, "reward": 0.861718761920929, "reward_std": 0.0897410586476326, "rewards/e2e_recall_precision_mixed_reward/mean": 0.861718761920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.25226952880620956, "sampling/importance_sampling_ratio/max": 1.9250823497772216, "sampling/importance_sampling_ratio/mean": 0.9998960614204406, "sampling/importance_sampling_ratio/min": 0.3237256646156311, "sampling/sampling_logp_difference/max": 1.26419837474823, "sampling/sampling_logp_difference/mean": 0.014087118953466416, "step": 3240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1780.0, "completions/max_terminated_length": 1780.0, "completions/mean_length": 1281.84375, "completions/mean_terminated_length": 1281.84375, "completions/min_length": 966.4, "completions/min_terminated_length": 966.4, "entropy": 0.2782359480857849, "epoch": 3.8131609870740304, "frac_reward_zero_std": 0.7, "grad_norm": 0.621003270149231, "learning_rate": 1.2248606736127939e-07, "loss": -0.0019, "num_tokens": 442625328.0, "reward": 0.9307291746139527, "reward_std": 0.0730149507522583, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9307291865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.1572045348584652, "sampling/importance_sampling_ratio/max": 1.9717673301696776, "sampling/importance_sampling_ratio/mean": 0.9999985694885254, "sampling/importance_sampling_ratio/min": 0.28547490313649176, "sampling/sampling_logp_difference/max": 1.4838282227516175, "sampling/sampling_logp_difference/mean": 0.01399837527424097, "step": 3245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1660.6, "completions/max_terminated_length": 1660.6, "completions/mean_length": 1281.51875, "completions/mean_terminated_length": 1281.51875, "completions/min_length": 981.8, "completions/min_terminated_length": 981.8, "entropy": 0.28906986117362976, "epoch": 3.8190364277320796, "frac_reward_zero_std": 0.55, "grad_norm": 0.572178304195404, "learning_rate": 1.2188030046038282e-07, "loss": -0.0023, "num_tokens": 443347766.0, "reward": 0.8393229246139526, "reward_std": 0.07080771774053574, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8393229246139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.23748641908168794, "sampling/importance_sampling_ratio/max": 1.9597267866134644, "sampling/importance_sampling_ratio/mean": 0.9999413132667542, "sampling/importance_sampling_ratio/min": 0.2964545637369156, "sampling/sampling_logp_difference/max": 1.229891586303711, "sampling/sampling_logp_difference/mean": 0.014197415299713611, "step": 3250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1707.8, "completions/max_terminated_length": 1707.8, "completions/mean_length": 1252.590625, "completions/mean_terminated_length": 1252.590625, "completions/min_length": 884.6, "completions/min_terminated_length": 884.6, "entropy": 0.2784834265708923, "epoch": 3.8249118683901293, "frac_reward_zero_std": 0.6, "grad_norm": 0.39195069670677185, "learning_rate": 1.212745335594863e-07, "loss": 0.0071, "num_tokens": 444042115.0, "reward": 0.8630208492279052, "reward_std": 0.06553929708898068, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8630208492279052, "rewards/e2e_recall_precision_mixed_reward/std": 0.2423491060733795, "sampling/importance_sampling_ratio/max": 1.9346032381057738, "sampling/importance_sampling_ratio/mean": 1.0000757694244384, "sampling/importance_sampling_ratio/min": 0.31034799516201017, "sampling/sampling_logp_difference/max": 1.2329391956329345, "sampling/sampling_logp_difference/mean": 0.013708932884037494, "step": 3255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1977.4, "completions/max_terminated_length": 1880.0, "completions/mean_length": 1329.4125, "completions/mean_terminated_length": 1325.8530029296876, "completions/min_length": 1028.4, "completions/min_terminated_length": 1028.4, "entropy": 0.2909530997276306, "epoch": 3.8307873090481785, "frac_reward_zero_std": 0.6, "grad_norm": 0.4510413408279419, "learning_rate": 1.2066876665858977e-07, "loss": -0.0081, "num_tokens": 444796419.0, "reward": 0.8528645873069763, "reward_std": 0.0627675049006939, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8528645992279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.26745030134916303, "sampling/importance_sampling_ratio/max": 1.9303704738616942, "sampling/importance_sampling_ratio/mean": 1.0000205278396606, "sampling/importance_sampling_ratio/min": 0.27986125648299554, "sampling/sampling_logp_difference/max": 5.938221645355225, "sampling/sampling_logp_difference/mean": 0.01452437173575163, "step": 3260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1806.4, "completions/max_terminated_length": 1806.4, "completions/mean_length": 1308.70625, "completions/mean_terminated_length": 1308.70625, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "entropy": 0.2953240931034088, "epoch": 3.8366627497062282, "frac_reward_zero_std": 0.7, "grad_norm": 0.0, "learning_rate": 1.2006299975769324e-07, "loss": -0.0009, "num_tokens": 445569973.0, "reward": 0.8123437643051148, "reward_std": 0.05132426992058754, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8123437643051148, "rewards/e2e_recall_precision_mixed_reward/std": 0.26707516610622406, "sampling/importance_sampling_ratio/max": 1.9395072221755982, "sampling/importance_sampling_ratio/mean": 1.0000399947166443, "sampling/importance_sampling_ratio/min": 0.33533908128738404, "sampling/sampling_logp_difference/max": 1.1794774770736693, "sampling/sampling_logp_difference/mean": 0.014524004608392715, "step": 3265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1771.0, "completions/max_terminated_length": 1771.0, "completions/mean_length": 1274.678125, "completions/mean_terminated_length": 1274.678125, "completions/min_length": 869.8, "completions/min_terminated_length": 869.8, "entropy": 0.2809648633003235, "epoch": 3.8425381903642775, "frac_reward_zero_std": 0.75, "grad_norm": 0.36577996611595154, "learning_rate": 1.194572328567967e-07, "loss": 0.0009, "num_tokens": 446295278.0, "reward": 0.9208333373069764, "reward_std": 0.04342363029718399, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9208333373069764, "rewards/e2e_recall_precision_mixed_reward/std": 0.1181575708091259, "sampling/importance_sampling_ratio/max": 1.9795986413955688, "sampling/importance_sampling_ratio/mean": 1.0001219749450683, "sampling/importance_sampling_ratio/min": 0.31653355807065964, "sampling/sampling_logp_difference/max": 1.2452943086624146, "sampling/sampling_logp_difference/mean": 0.013978814147412777, "step": 3270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1794.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 1265.828125, "completions/mean_terminated_length": 1265.828125, "completions/min_length": 865.8, "completions/min_terminated_length": 865.8, "entropy": 0.28681405186653136, "epoch": 3.8484136310223267, "frac_reward_zero_std": 0.5, "grad_norm": 0.7446997165679932, "learning_rate": 1.1885146595590016e-07, "loss": 0.0069, "num_tokens": 447073655.0, "reward": 0.8171354174613953, "reward_std": 0.09392708986997604, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8171354174613953, "rewards/e2e_recall_precision_mixed_reward/std": 0.27176968157291415, "sampling/importance_sampling_ratio/max": 1.941535973548889, "sampling/importance_sampling_ratio/mean": 1.000026822090149, "sampling/importance_sampling_ratio/min": 0.37527463138103484, "sampling/sampling_logp_difference/max": 1.0598471283912658, "sampling/sampling_logp_difference/mean": 0.014365506730973721, "step": 3275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1762.8, "completions/max_terminated_length": 1762.8, "completions/mean_length": 1243.915625, "completions/mean_terminated_length": 1243.915625, "completions/min_length": 941.4, "completions/min_terminated_length": 941.4, "entropy": 0.26819110810756686, "epoch": 3.854289071680376, "frac_reward_zero_std": 0.65, "grad_norm": 0.6066403985023499, "learning_rate": 1.1824569905500363e-07, "loss": -0.0064, "num_tokens": 447775020.0, "reward": 0.9101562619209289, "reward_std": 0.07682659178972244, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9101562619209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.1867051303386688, "sampling/importance_sampling_ratio/max": 1.9701864957809447, "sampling/importance_sampling_ratio/mean": 1.00002464056015, "sampling/importance_sampling_ratio/min": 0.3514017522335052, "sampling/sampling_logp_difference/max": 1.1939729452133179, "sampling/sampling_logp_difference/mean": 0.013550216145813466, "step": 3280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1740.2, "completions/max_terminated_length": 1740.2, "completions/mean_length": 1230.046875, "completions/mean_terminated_length": 1230.046875, "completions/min_length": 923.4, "completions/min_terminated_length": 923.4, "entropy": 0.27420614361763, "epoch": 3.860164512338425, "frac_reward_zero_std": 0.7, "grad_norm": 0.4645025432109833, "learning_rate": 1.176399321541071e-07, "loss": -0.0008, "num_tokens": 448493579.0, "reward": 0.7045312643051147, "reward_std": 0.05252151843160391, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7045312643051147, "rewards/e2e_recall_precision_mixed_reward/std": 0.3159709542989731, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999969244003296, "sampling/importance_sampling_ratio/min": 0.31865512803196905, "sampling/sampling_logp_difference/max": 1.5703006744384767, "sampling/sampling_logp_difference/mean": 0.013808564841747284, "step": 3285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1857.2, "completions/max_terminated_length": 1857.2, "completions/mean_length": 1297.978125, "completions/mean_terminated_length": 1297.978125, "completions/min_length": 910.6, "completions/min_terminated_length": 910.6, "entropy": 0.28309070467948916, "epoch": 3.866039952996475, "frac_reward_zero_std": 0.5, "grad_norm": 0.47286638617515564, "learning_rate": 1.1703416525321055e-07, "loss": -0.009, "num_tokens": 449245108.0, "reward": 0.8286458492279053, "reward_std": 0.09949100911617278, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8286458492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.21464731693267822, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000564217567445, "sampling/importance_sampling_ratio/min": 0.40592106580734255, "sampling/sampling_logp_difference/max": 0.982337212562561, "sampling/sampling_logp_difference/mean": 0.01408249158412218, "step": 3290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1622.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 1222.871875, "completions/mean_terminated_length": 1222.871875, "completions/min_length": 878.2, "completions/min_terminated_length": 878.2, "entropy": 0.28852399289608, "epoch": 3.871915393654524, "frac_reward_zero_std": 0.7, "grad_norm": 0.5962092280387878, "learning_rate": 1.1642839835231403e-07, "loss": 0.005, "num_tokens": 449982747.0, "reward": 0.7946875214576721, "reward_std": 0.05859446972608566, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7946875333786011, "rewards/e2e_recall_precision_mixed_reward/std": 0.2911753743886948, "sampling/importance_sampling_ratio/max": 1.9743703126907348, "sampling/importance_sampling_ratio/mean": 1.000085210800171, "sampling/importance_sampling_ratio/min": 0.3090327255427837, "sampling/sampling_logp_difference/max": 1.462190842628479, "sampling/sampling_logp_difference/mean": 0.014431641064584255, "step": 3295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1854.8, "completions/max_terminated_length": 1854.8, "completions/mean_length": 1340.121875, "completions/mean_terminated_length": 1340.121875, "completions/min_length": 984.0, "completions/min_terminated_length": 984.0, "entropy": 0.27524838745594027, "epoch": 3.8777908343125733, "frac_reward_zero_std": 0.6, "grad_norm": 0.488313227891922, "learning_rate": 1.1582263145141749e-07, "loss": 0.001, "num_tokens": 450719506.0, "reward": 0.83671875, "reward_std": 0.07024868726730346, "rewards/e2e_recall_precision_mixed_reward/mean": 0.836718761920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.24404936730861665, "sampling/importance_sampling_ratio/max": 1.9955852031707764, "sampling/importance_sampling_ratio/mean": 0.9999388694763184, "sampling/importance_sampling_ratio/min": 0.33213537335395815, "sampling/sampling_logp_difference/max": 1.3013247728347779, "sampling/sampling_logp_difference/mean": 0.01370444092899561, "step": 3300 }, { "epoch": 3.8777908343125733, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1689.44, "eval_completions/max_terminated_length": 1689.44, "eval_completions/mean_length": 1224.116875, "eval_completions/mean_terminated_length": 1224.116875, "eval_completions/min_length": 919.28, "eval_completions/min_terminated_length": 919.28, "eval_entropy": 0.2747154176235199, "eval_frac_reward_zero_std": 0.63, "eval_loss": 0.0016999093350023031, "eval_num_tokens": 450719506.0, "eval_reward": 0.77307293176651, "eval_reward_std": 0.07358525022864341, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.77307293176651, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2892902755737305, "eval_runtime": 460.3218, "eval_samples_per_second": 0.217, "eval_sampling/importance_sampling_ratio/max": 1.9607455730438232, "eval_sampling/importance_sampling_ratio/mean": 0.9999883246421813, "eval_sampling/importance_sampling_ratio/min": 0.3240952134691179, "eval_sampling/sampling_logp_difference/max": 1.5946112561225891, "eval_sampling/sampling_logp_difference/mean": 0.013883443474769592, "eval_steps_per_second": 0.004, "step": 3300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1860.4, "completions/max_terminated_length": 1860.4, "completions/mean_length": 1337.828125, "completions/mean_terminated_length": 1337.828125, "completions/min_length": 1008.6, "completions/min_terminated_length": 1008.6, "entropy": 0.274376255273819, "epoch": 3.883666274970623, "frac_reward_zero_std": 0.7, "grad_norm": 0.4430628716945648, "learning_rate": 1.1521686455052095e-07, "loss": 0.0013, "num_tokens": 451469419.0, "reward": 0.8466145992279053, "reward_std": 0.051951204985380174, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8466145992279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.2435604065656662, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000598907470704, "sampling/importance_sampling_ratio/min": 0.2840733528137207, "sampling/sampling_logp_difference/max": 1.5291481018066406, "sampling/sampling_logp_difference/mean": 0.013875341042876244, "step": 3305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1777.4, "completions/max_terminated_length": 1777.4, "completions/mean_length": 1238.178125, "completions/mean_terminated_length": 1238.178125, "completions/min_length": 889.6, "completions/min_terminated_length": 889.6, "entropy": 0.2748557984828949, "epoch": 3.8895417156286722, "frac_reward_zero_std": 0.7, "grad_norm": 0.46624255180358887, "learning_rate": 1.1461109764962442e-07, "loss": 0.0037, "num_tokens": 452191012.0, "reward": 0.9239583492279053, "reward_std": 0.06863614469766617, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9239583492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.15527809858322145, "sampling/importance_sampling_ratio/max": 1.9529468297958374, "sampling/importance_sampling_ratio/mean": 0.9999673128128052, "sampling/importance_sampling_ratio/min": 0.35668731927871705, "sampling/sampling_logp_difference/max": 1.2592904806137084, "sampling/sampling_logp_difference/mean": 0.013761545717716216, "step": 3310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1729.8, "completions/max_terminated_length": 1729.8, "completions/mean_length": 1261.10625, "completions/mean_terminated_length": 1261.10625, "completions/min_length": 979.8, "completions/min_terminated_length": 979.8, "entropy": 0.27796257734298707, "epoch": 3.8954171562867215, "frac_reward_zero_std": 0.65, "grad_norm": 0.4277585446834564, "learning_rate": 1.1400533074872788e-07, "loss": 0.0018, "num_tokens": 452904374.0, "reward": 0.909375011920929, "reward_std": 0.062048446759581564, "rewards/e2e_recall_precision_mixed_reward/mean": 0.909375011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.17083195745944976, "sampling/importance_sampling_ratio/max": 1.863189125061035, "sampling/importance_sampling_ratio/mean": 0.9999260902404785, "sampling/importance_sampling_ratio/min": 0.3553848028182983, "sampling/sampling_logp_difference/max": 1.1737365007400513, "sampling/sampling_logp_difference/mean": 0.013707248121500015, "step": 3315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1736.8, "completions/max_terminated_length": 1736.8, "completions/mean_length": 1264.659375, "completions/mean_terminated_length": 1264.659375, "completions/min_length": 948.2, "completions/min_terminated_length": 948.2, "entropy": 0.27279492020606994, "epoch": 3.9012925969447707, "frac_reward_zero_std": 0.6, "grad_norm": 0.650298535823822, "learning_rate": 1.1339956384783135e-07, "loss": -0.0041, "num_tokens": 453642489.0, "reward": 0.846875011920929, "reward_std": 0.0856197141110897, "rewards/e2e_recall_precision_mixed_reward/mean": 0.846875011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.25374895632266997, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000032126903534, "sampling/importance_sampling_ratio/min": 0.3886173486709595, "sampling/sampling_logp_difference/max": 1.1749614000320434, "sampling/sampling_logp_difference/mean": 0.01352920550853014, "step": 3320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1809.2, "completions/max_terminated_length": 1753.6, "completions/mean_length": 1218.434375, "completions/mean_terminated_length": 1215.080126953125, "completions/min_length": 850.4, "completions/min_terminated_length": 850.4, "entropy": 0.25508340895175935, "epoch": 3.90716803760282, "frac_reward_zero_std": 0.85, "grad_norm": 0.0, "learning_rate": 1.1279379694693482e-07, "loss": -0.0036, "num_tokens": 454369584.0, "reward": 0.8700520873069764, "reward_std": 0.02701122909784317, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8700520873069764, "rewards/e2e_recall_precision_mixed_reward/std": 0.20110743045806884, "sampling/importance_sampling_ratio/max": 1.851835560798645, "sampling/importance_sampling_ratio/mean": 1.000059926509857, "sampling/importance_sampling_ratio/min": 0.34435550272464754, "sampling/sampling_logp_difference/max": 1.1637211084365844, "sampling/sampling_logp_difference/mean": 0.013143818266689778, "step": 3325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1748.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 1249.828125, "completions/mean_terminated_length": 1249.828125, "completions/min_length": 952.8, "completions/min_terminated_length": 952.8, "entropy": 0.2764308452606201, "epoch": 3.9130434782608696, "frac_reward_zero_std": 0.6, "grad_norm": 0.49342775344848633, "learning_rate": 1.1218803004603827e-07, "loss": 0.0013, "num_tokens": 455124777.0, "reward": 0.8270833492279053, "reward_std": 0.06940719485282898, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8270833492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.3089472085237503, "sampling/importance_sampling_ratio/max": 1.8889091730117797, "sampling/importance_sampling_ratio/mean": 0.9999609589576721, "sampling/importance_sampling_ratio/min": 0.3440424233675003, "sampling/sampling_logp_difference/max": 1.1790516376495361, "sampling/sampling_logp_difference/mean": 0.01391413640230894, "step": 3330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1740.6, "completions/max_terminated_length": 1740.6, "completions/mean_length": 1237.1875, "completions/mean_terminated_length": 1237.1875, "completions/min_length": 922.6, "completions/min_terminated_length": 922.6, "entropy": 0.2558923900127411, "epoch": 3.918918918918919, "frac_reward_zero_std": 0.8, "grad_norm": 0.6352602243423462, "learning_rate": 1.1158226314514174e-07, "loss": -0.0015, "num_tokens": 455869669.0, "reward": 0.770312511920929, "reward_std": 0.03016253113746643, "rewards/e2e_recall_precision_mixed_reward/mean": 0.770312511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.2921178758144379, "sampling/importance_sampling_ratio/max": 1.9723082304000854, "sampling/importance_sampling_ratio/mean": 1.0000006914138795, "sampling/importance_sampling_ratio/min": 0.3538441300392151, "sampling/sampling_logp_difference/max": 1.0752784729003906, "sampling/sampling_logp_difference/mean": 0.01325883362442255, "step": 3335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1676.0, "completions/max_terminated_length": 1676.0, "completions/mean_length": 1213.696875, "completions/mean_terminated_length": 1213.696875, "completions/min_length": 882.2, "completions/min_terminated_length": 882.2, "entropy": 0.2639861524105072, "epoch": 3.9247943595769685, "frac_reward_zero_std": 0.7, "grad_norm": 0.0, "learning_rate": 1.109764962442452e-07, "loss": -0.0031, "num_tokens": 456588324.0, "reward": 0.8276041746139526, "reward_std": 0.045714473351836205, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8276041746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.2570072665810585, "sampling/importance_sampling_ratio/max": 1.9386536121368407, "sampling/importance_sampling_ratio/mean": 0.9998268485069275, "sampling/importance_sampling_ratio/min": 0.3666181623935699, "sampling/sampling_logp_difference/max": 1.0288118839263916, "sampling/sampling_logp_difference/mean": 0.013535234890878201, "step": 3340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1798.0, "completions/max_terminated_length": 1796.4, "completions/mean_length": 1286.8375, "completions/mean_terminated_length": 1271.7397705078124, "completions/min_length": 940.4, "completions/min_terminated_length": 940.4, "entropy": 0.2659426271915436, "epoch": 3.9306698002350178, "frac_reward_zero_std": 0.65, "grad_norm": 0.5957963466644287, "learning_rate": 1.1037072934334868e-07, "loss": -0.0059, "num_tokens": 457293020.0, "reward": 0.9078125119209289, "reward_std": 0.057849539816379546, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9078125119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.1975775107741356, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999847173690796, "sampling/importance_sampling_ratio/min": 0.31824939250946044, "sampling/sampling_logp_difference/max": 1.3847343921661377, "sampling/sampling_logp_difference/mean": 0.013584697060286999, "step": 3345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1622.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 1119.728125, "completions/mean_terminated_length": 1119.728125, "completions/min_length": 785.4, "completions/min_terminated_length": 785.4, "entropy": 0.24143437743186952, "epoch": 3.936545240893067, "frac_reward_zero_std": 0.55, "grad_norm": 0.4451583921909332, "learning_rate": 1.0976496244245213e-07, "loss": 0.004, "num_tokens": 457972389.0, "reward": 0.8882812619209289, "reward_std": 0.07290575057268142, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8882812619209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.19074126631021499, "sampling/importance_sampling_ratio/max": 1.9808499813079834, "sampling/importance_sampling_ratio/mean": 1.0000173211097718, "sampling/importance_sampling_ratio/min": 0.3381476104259491, "sampling/sampling_logp_difference/max": 1.2803590774536133, "sampling/sampling_logp_difference/mean": 0.012810366414487361, "step": 3350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1703.8, "completions/max_terminated_length": 1703.8, "completions/mean_length": 1184.834375, "completions/mean_terminated_length": 1184.834375, "completions/min_length": 811.2, "completions/min_terminated_length": 811.2, "entropy": 0.24822763800621034, "epoch": 3.9424206815511162, "frac_reward_zero_std": 0.55, "grad_norm": 0.5334610939025879, "learning_rate": 1.0915919554155561e-07, "loss": 0.0016, "num_tokens": 458701424.0, "reward": 0.9039583563804626, "reward_std": 0.08115731552243233, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9039583563804626, "rewards/e2e_recall_precision_mixed_reward/std": 0.18836814165115356, "sampling/importance_sampling_ratio/max": 1.980670428276062, "sampling/importance_sampling_ratio/mean": 1.00001859664917, "sampling/importance_sampling_ratio/min": 0.35833509400172686, "sampling/sampling_logp_difference/max": 4.0236934423446655, "sampling/sampling_logp_difference/mean": 0.0130048006772995, "step": 3355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1886.0, "completions/max_terminated_length": 1886.0, "completions/mean_length": 1239.865625, "completions/mean_terminated_length": 1239.865625, "completions/min_length": 899.2, "completions/min_terminated_length": 899.2, "entropy": 0.27396275401115416, "epoch": 3.9482961222091655, "frac_reward_zero_std": 0.45, "grad_norm": 0.7187315821647644, "learning_rate": 1.0855342864065907e-07, "loss": 0.0056, "num_tokens": 459467877.0, "reward": 0.7541666746139526, "reward_std": 0.1147657498717308, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7541666746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.3114150196313858, "sampling/importance_sampling_ratio/max": 1.89167377948761, "sampling/importance_sampling_ratio/mean": 0.9999890923500061, "sampling/importance_sampling_ratio/min": 0.37039981186389925, "sampling/sampling_logp_difference/max": 1.0398942470550536, "sampling/sampling_logp_difference/mean": 0.013965315371751785, "step": 3360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1493.8, "completions/max_terminated_length": 1493.8, "completions/mean_length": 1098.35625, "completions/mean_terminated_length": 1098.35625, "completions/min_length": 846.0, "completions/min_terminated_length": 846.0, "entropy": 0.2534207671880722, "epoch": 3.954171562867215, "frac_reward_zero_std": 0.75, "grad_norm": 0.579468846321106, "learning_rate": 1.0794766173976253e-07, "loss": -0.0038, "num_tokens": 460121783.0, "reward": 0.868541669845581, "reward_std": 0.04859443977475166, "rewards/e2e_recall_precision_mixed_reward/mean": 0.868541669845581, "rewards/e2e_recall_precision_mixed_reward/std": 0.21250081658363343, "sampling/importance_sampling_ratio/max": 1.9952316761016846, "sampling/importance_sampling_ratio/mean": 1.0000939130783082, "sampling/importance_sampling_ratio/min": 0.39403712153434756, "sampling/sampling_logp_difference/max": 1.1500419616699218, "sampling/sampling_logp_difference/mean": 0.01323564574122429, "step": 3365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375, "completions/max_length": 1879.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 1251.409375, "completions/mean_terminated_length": 1241.074267578125, "completions/min_length": 907.0, "completions/min_terminated_length": 907.0, "entropy": 0.2608177125453949, "epoch": 3.9600470035252644, "frac_reward_zero_std": 0.75, "grad_norm": 0.5231078863143921, "learning_rate": 1.07341894838866e-07, "loss": -0.0226, "num_tokens": 460847966.0, "reward": 0.7572916746139526, "reward_std": 0.060109014809131625, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7572916746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.28681475222110747, "sampling/importance_sampling_ratio/max": 1.9938726425170898, "sampling/importance_sampling_ratio/mean": 1.0000676155090331, "sampling/importance_sampling_ratio/min": 0.39298430681228635, "sampling/sampling_logp_difference/max": 1.071886992454529, "sampling/sampling_logp_difference/mean": 0.013303074613213539, "step": 3370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1426.8, "completions/max_terminated_length": 1426.8, "completions/mean_length": 1150.00625, "completions/mean_terminated_length": 1150.00625, "completions/min_length": 928.8, "completions/min_terminated_length": 928.8, "entropy": 0.2832071840763092, "epoch": 3.9659224441833136, "frac_reward_zero_std": 0.85, "grad_norm": 0.3876747190952301, "learning_rate": 1.0673612793796946e-07, "loss": -0.0016, "num_tokens": 461555024.0, "reward": 0.8479166746139526, "reward_std": 0.02359100729227066, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8479166746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.2031202271580696, "sampling/importance_sampling_ratio/max": 1.9597638845443726, "sampling/importance_sampling_ratio/mean": 1.0000397205352782, "sampling/importance_sampling_ratio/min": 0.4398146092891693, "sampling/sampling_logp_difference/max": 0.9516366958618164, "sampling/sampling_logp_difference/mean": 0.014147293195128441, "step": 3375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1734.8, "completions/max_terminated_length": 1734.8, "completions/mean_length": 1196.45625, "completions/mean_terminated_length": 1196.45625, "completions/min_length": 912.8, "completions/min_terminated_length": 912.8, "entropy": 0.24856521785259247, "epoch": 3.9717978848413633, "frac_reward_zero_std": 0.65, "grad_norm": 0.44269227981567383, "learning_rate": 1.0613036103707294e-07, "loss": 0.0011, "num_tokens": 462241618.0, "reward": 0.7182291626930237, "reward_std": 0.058018694072961806, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7182291626930237, "rewards/e2e_recall_precision_mixed_reward/std": 0.33283857107162473, "sampling/importance_sampling_ratio/max": 1.9259427070617676, "sampling/importance_sampling_ratio/mean": 1.0000762820243836, "sampling/importance_sampling_ratio/min": 0.319977280497551, "sampling/sampling_logp_difference/max": 1.3419868707656861, "sampling/sampling_logp_difference/mean": 0.012762147746980191, "step": 3380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1593.2, "completions/max_terminated_length": 1593.2, "completions/mean_length": 1190.53125, "completions/mean_terminated_length": 1190.53125, "completions/min_length": 852.0, "completions/min_terminated_length": 852.0, "entropy": 0.2616691470146179, "epoch": 3.9776733254994125, "frac_reward_zero_std": 0.8, "grad_norm": 0.4497514069080353, "learning_rate": 1.055245941361764e-07, "loss": 0.003, "num_tokens": 462957900.0, "reward": 0.896875, "reward_std": 0.03596546053886414, "rewards/e2e_recall_precision_mixed_reward/mean": 0.896875, "rewards/e2e_recall_precision_mixed_reward/std": 0.17488451898097992, "sampling/importance_sampling_ratio/max": 1.987834596633911, "sampling/importance_sampling_ratio/mean": 1.0000129342079163, "sampling/importance_sampling_ratio/min": 0.36617528796195986, "sampling/sampling_logp_difference/max": 1.1533316850662232, "sampling/sampling_logp_difference/mean": 0.013550573959946632, "step": 3385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1514.2, "completions/max_terminated_length": 1514.2, "completions/mean_length": 1162.390625, "completions/mean_terminated_length": 1162.390625, "completions/min_length": 885.8, "completions/min_terminated_length": 885.8, "entropy": 0.26850571632385256, "epoch": 3.983548766157462, "frac_reward_zero_std": 0.55, "grad_norm": 0.4390755593776703, "learning_rate": 1.0491882723527986e-07, "loss": 0.0001, "num_tokens": 463636649.0, "reward": 0.7920312643051147, "reward_std": 0.08977707475423813, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7920312643051147, "rewards/e2e_recall_precision_mixed_reward/std": 0.29775896966457366, "sampling/importance_sampling_ratio/max": 1.9499453544616698, "sampling/importance_sampling_ratio/mean": 0.9999655961990357, "sampling/importance_sampling_ratio/min": 0.35879728496074675, "sampling/sampling_logp_difference/max": 1.1991132736206054, "sampling/sampling_logp_difference/mean": 0.013767444901168347, "step": 3390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1806.0, "completions/max_terminated_length": 1806.0, "completions/mean_length": 1238.225, "completions/mean_terminated_length": 1238.225, "completions/min_length": 815.6, "completions/min_terminated_length": 815.6, "entropy": 0.2589147299528122, "epoch": 3.989424206815511, "frac_reward_zero_std": 0.6, "grad_norm": 0.3654157817363739, "learning_rate": 1.0431306033438332e-07, "loss": 0.001, "num_tokens": 464341057.0, "reward": 0.8408854246139527, "reward_std": 0.10250527374446392, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8408854246139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.2893193304538727, "sampling/importance_sampling_ratio/max": 1.986938238143921, "sampling/importance_sampling_ratio/mean": 1.000028955936432, "sampling/importance_sampling_ratio/min": 0.2818640649318695, "sampling/sampling_logp_difference/max": 1.6181657314300537, "sampling/sampling_logp_difference/mean": 0.013167793862521648, "step": 3395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1650.2, "completions/max_terminated_length": 1650.2, "completions/mean_length": 1230.4125, "completions/mean_terminated_length": 1230.4125, "completions/min_length": 968.8, "completions/min_terminated_length": 968.8, "entropy": 0.27528418600559235, "epoch": 3.9952996474735603, "frac_reward_zero_std": 0.7, "grad_norm": 0.3999543786048889, "learning_rate": 1.0370729343348679e-07, "loss": -0.0034, "num_tokens": 465059861.0, "reward": 0.7984375, "reward_std": 0.07827110588550568, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7984375, "rewards/e2e_recall_precision_mixed_reward/std": 0.2598001092672348, "sampling/importance_sampling_ratio/max": 1.9498087167739868, "sampling/importance_sampling_ratio/mean": 1.0000772595405578, "sampling/importance_sampling_ratio/min": 0.27214218527078626, "sampling/sampling_logp_difference/max": 1.4822240352630616, "sampling/sampling_logp_difference/mean": 0.013907233253121376, "step": 3400 }, { "epoch": 3.9952996474735603, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1572.6, "eval_completions/max_terminated_length": 1572.6, "eval_completions/mean_length": 1161.240625, "eval_completions/mean_terminated_length": 1161.240625, "eval_completions/min_length": 880.12, "eval_completions/min_terminated_length": 880.12, "eval_entropy": 0.2626095861196518, "eval_frac_reward_zero_std": 0.6, "eval_loss": 0.0030293413437902927, "eval_num_tokens": 465059861.0, "eval_reward": 0.7789062619209289, "eval_reward_std": 0.07541791707277298, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7789062619209289, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2894728544354439, "eval_runtime": 437.9033, "eval_samples_per_second": 0.228, "eval_sampling/importance_sampling_ratio/max": 1.937536702156067, "eval_sampling/importance_sampling_ratio/mean": 1.0000023913383485, "eval_sampling/importance_sampling_ratio/min": 0.3022413222497363, "eval_sampling/sampling_logp_difference/max": 2.2783599162101744, "eval_sampling/sampling_logp_difference/mean": 0.01340147852897644, "eval_steps_per_second": 0.005, "step": 3400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1525.2, "completions/max_terminated_length": 1525.2, "completions/mean_length": 1220.359375, "completions/mean_terminated_length": 1220.359375, "completions/min_length": 995.0, "completions/min_terminated_length": 995.0, "entropy": 0.2618350923061371, "epoch": 4.0011750881316095, "frac_reward_zero_std": 0.6, "grad_norm": 0.47610652446746826, "learning_rate": 1.0310152653259026e-07, "loss": 0.0047, "num_tokens": 465757512.0, "reward": 0.8255208373069763, "reward_std": 0.07653152495622635, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8255208373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.2626977309584618, "sampling/importance_sampling_ratio/max": 1.943035101890564, "sampling/importance_sampling_ratio/mean": 0.9999830961227417, "sampling/importance_sampling_ratio/min": 0.2748304158449173, "sampling/sampling_logp_difference/max": 1.504104995727539, "sampling/sampling_logp_difference/mean": 0.013258552365005016, "step": 3405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1754.2, "completions/max_terminated_length": 1754.2, "completions/mean_length": 1229.028125, "completions/mean_terminated_length": 1229.028125, "completions/min_length": 971.6, "completions/min_terminated_length": 971.6, "entropy": 0.2637974351644516, "epoch": 4.00705052878966, "frac_reward_zero_std": 0.55, "grad_norm": 0.6931338310241699, "learning_rate": 1.0249575963169373e-07, "loss": 0.0016, "num_tokens": 466462977.0, "reward": 0.8789583444595337, "reward_std": 0.09424636662006378, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8789583444595337, "rewards/e2e_recall_precision_mixed_reward/std": 0.1916855752468109, "sampling/importance_sampling_ratio/max": 1.9907166004180907, "sampling/importance_sampling_ratio/mean": 1.000013256072998, "sampling/importance_sampling_ratio/min": 0.25232034027576444, "sampling/sampling_logp_difference/max": 1.4631556510925292, "sampling/sampling_logp_difference/mean": 0.013366755843162537, "step": 3410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1727.6, "completions/max_terminated_length": 1727.6, "completions/mean_length": 1218.85625, "completions/mean_terminated_length": 1218.85625, "completions/min_length": 871.2, "completions/min_terminated_length": 871.2, "entropy": 0.2636944532394409, "epoch": 4.012925969447709, "frac_reward_zero_std": 0.55, "grad_norm": 0.49683284759521484, "learning_rate": 1.0188999273079718e-07, "loss": 0.0018, "num_tokens": 467176147.0, "reward": 0.864062511920929, "reward_std": 0.07282592691481113, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8640625238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.25493159517645836, "sampling/importance_sampling_ratio/max": 1.9635956287384033, "sampling/importance_sampling_ratio/mean": 0.9999071478843689, "sampling/importance_sampling_ratio/min": 0.31713399589061736, "sampling/sampling_logp_difference/max": 1.2184108018875122, "sampling/sampling_logp_difference/mean": 0.01336588580161333, "step": 3415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1758.2, "completions/max_terminated_length": 1758.2, "completions/mean_length": 1226.559375, "completions/mean_terminated_length": 1226.559375, "completions/min_length": 927.0, "completions/min_terminated_length": 927.0, "entropy": 0.25220133662223815, "epoch": 4.018801410105758, "frac_reward_zero_std": 0.75, "grad_norm": 0.0, "learning_rate": 1.0128422582990065e-07, "loss": -0.0008, "num_tokens": 467866534.0, "reward": 0.9083333373069763, "reward_std": 0.05359421372413635, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9083333373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.1494651608169079, "sampling/importance_sampling_ratio/max": 1.974375057220459, "sampling/importance_sampling_ratio/mean": 1.0000734686851502, "sampling/importance_sampling_ratio/min": 0.34163759648799896, "sampling/sampling_logp_difference/max": 1.3871023654937744, "sampling/sampling_logp_difference/mean": 0.012934430874884129, "step": 3420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1483.4, "completions/max_terminated_length": 1483.4, "completions/mean_length": 1160.09375, "completions/mean_terminated_length": 1160.09375, "completions/min_length": 892.4, "completions/min_terminated_length": 892.4, "entropy": 0.2803857684135437, "epoch": 4.024676850763807, "frac_reward_zero_std": 0.6, "grad_norm": 0.6446830630302429, "learning_rate": 1.0067845892900411e-07, "loss": -0.0032, "num_tokens": 468585380.0, "reward": 0.8739583492279053, "reward_std": 0.07574607878923416, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8739583492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.1989564597606659, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999167919158936, "sampling/importance_sampling_ratio/min": 0.32423160076141355, "sampling/sampling_logp_difference/max": 1.2724119901657105, "sampling/sampling_logp_difference/mean": 0.01405625492334366, "step": 3425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1626.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 1228.2875, "completions/mean_terminated_length": 1228.2875, "completions/min_length": 988.2, "completions/min_terminated_length": 988.2, "entropy": 0.265902704000473, "epoch": 4.030552291421857, "frac_reward_zero_std": 0.55, "grad_norm": 0.6040170788764954, "learning_rate": 1.0007269202810759e-07, "loss": -0.0004, "num_tokens": 469277968.0, "reward": 0.8377604365348816, "reward_std": 0.08370122164487839, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8377604365348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.2523147314786911, "sampling/importance_sampling_ratio/max": 1.9924169778823853, "sampling/importance_sampling_ratio/mean": 0.9999254226684571, "sampling/importance_sampling_ratio/min": 0.3520743578672409, "sampling/sampling_logp_difference/max": 1.206534743309021, "sampling/sampling_logp_difference/mean": 0.013386547565460205, "step": 3430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1853.8, "completions/max_terminated_length": 1853.8, "completions/mean_length": 1253.959375, "completions/mean_terminated_length": 1253.959375, "completions/min_length": 908.4, "completions/min_terminated_length": 908.4, "entropy": 0.26015831232070924, "epoch": 4.036427732079906, "frac_reward_zero_std": 0.65, "grad_norm": 0.4293096363544464, "learning_rate": 9.946692512721104e-08, "loss": -0.0044, "num_tokens": 470030387.0, "reward": 0.73828125, "reward_std": 0.06746623069047927, "rewards/e2e_recall_precision_mixed_reward/mean": 0.73828125, "rewards/e2e_recall_precision_mixed_reward/std": 0.3138516306877136, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999528050422668, "sampling/importance_sampling_ratio/min": 0.292254401743412, "sampling/sampling_logp_difference/max": 1.4798493027687072, "sampling/sampling_logp_difference/mean": 0.013409636914730072, "step": 3435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 1659.4, "completions/max_terminated_length": 1593.2, "completions/mean_length": 1152.0875, "completions/mean_terminated_length": 1144.1885986328125, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "entropy": 0.2462655335664749, "epoch": 4.042303172737955, "frac_reward_zero_std": 0.6, "grad_norm": 0.9179558157920837, "learning_rate": 9.88611582263145e-08, "loss": -0.0058, "num_tokens": 470680119.0, "reward": 0.9131770968437195, "reward_std": 0.06535822451114655, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9131770968437195, "rewards/e2e_recall_precision_mixed_reward/std": 0.18944206982851028, "sampling/importance_sampling_ratio/max": 1.987571358680725, "sampling/importance_sampling_ratio/mean": 0.9999889135360718, "sampling/importance_sampling_ratio/min": 0.42287402153015136, "sampling/sampling_logp_difference/max": 0.9650676608085632, "sampling/sampling_logp_difference/mean": 0.01276344656944275, "step": 3440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1661.4, "completions/max_terminated_length": 1661.4, "completions/mean_length": 1190.309375, "completions/mean_terminated_length": 1190.309375, "completions/min_length": 872.8, "completions/min_terminated_length": 872.8, "entropy": 0.27079584300518034, "epoch": 4.048178613396004, "frac_reward_zero_std": 0.7, "grad_norm": 0.0, "learning_rate": 9.825539132541798e-08, "loss": 0.0053, "num_tokens": 471372362.0, "reward": 0.8203125119209289, "reward_std": 0.03124999850988388, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8203125119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.2445871613919735, "sampling/importance_sampling_ratio/max": 1.9853496074676513, "sampling/importance_sampling_ratio/mean": 0.9999292612075805, "sampling/importance_sampling_ratio/min": 0.3703574028797448, "sampling/sampling_logp_difference/max": 1.7025861740112305, "sampling/sampling_logp_difference/mean": 0.01366796400398016, "step": 3445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1731.2, "completions/max_terminated_length": 1731.2, "completions/mean_length": 1228.340625, "completions/mean_terminated_length": 1228.340625, "completions/min_length": 884.4, "completions/min_terminated_length": 884.4, "entropy": 0.2743656039237976, "epoch": 4.054054054054054, "frac_reward_zero_std": 0.6, "grad_norm": 0.6055347323417664, "learning_rate": 9.764962442452144e-08, "loss": 0.003, "num_tokens": 472078599.0, "reward": 0.8036458492279053, "reward_std": 0.08734508380293846, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8036458492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.26783247590065, "sampling/importance_sampling_ratio/max": 1.965476965904236, "sampling/importance_sampling_ratio/mean": 1.0000924825668336, "sampling/importance_sampling_ratio/min": 0.3593331933021545, "sampling/sampling_logp_difference/max": 1.0880284786224366, "sampling/sampling_logp_difference/mean": 0.013652561791241169, "step": 3450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 1229.25, "completions/mean_terminated_length": 1229.25, "completions/min_length": 882.8, "completions/min_terminated_length": 882.8, "entropy": 0.27147723734378815, "epoch": 4.059929494712104, "frac_reward_zero_std": 0.7, "grad_norm": 0.573058009147644, "learning_rate": 9.70438575236249e-08, "loss": 0.0021, "num_tokens": 472810231.0, "reward": 0.9270833373069763, "reward_std": 0.05796501636505127, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9270833373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.17348659336566924, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001057624816894, "sampling/importance_sampling_ratio/min": 0.3474849671125412, "sampling/sampling_logp_difference/max": 1.1374139070510865, "sampling/sampling_logp_difference/mean": 0.013674916699528694, "step": 3455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1920.2, "completions/max_terminated_length": 1883.6, "completions/mean_length": 1295.409375, "completions/mean_terminated_length": 1291.970556640625, "completions/min_length": 890.2, "completions/min_terminated_length": 890.2, "entropy": 0.26137855648994446, "epoch": 4.065804935370153, "frac_reward_zero_std": 0.7, "grad_norm": 0.5707703232765198, "learning_rate": 9.643809062272837e-08, "loss": -0.0021, "num_tokens": 473540566.0, "reward": 0.8479166865348816, "reward_std": 0.056248662620782854, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8479166865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.25568803399801254, "sampling/importance_sampling_ratio/max": 1.9878350973129273, "sampling/importance_sampling_ratio/mean": 0.9999929904937744, "sampling/importance_sampling_ratio/min": 0.35799447596073153, "sampling/sampling_logp_difference/max": 1.1041216850280762, "sampling/sampling_logp_difference/mean": 0.013470960408449173, "step": 3460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1668.8, "completions/max_terminated_length": 1668.8, "completions/mean_length": 1234.76875, "completions/mean_terminated_length": 1234.76875, "completions/min_length": 984.6, "completions/min_terminated_length": 984.6, "entropy": 0.27419663667678834, "epoch": 4.071680376028202, "frac_reward_zero_std": 0.55, "grad_norm": 0.6924020648002625, "learning_rate": 9.583232372183183e-08, "loss": 0.0031, "num_tokens": 474250844.0, "reward": 0.9218750119209289, "reward_std": 0.07348827123641968, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9218750119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.1793515682220459, "sampling/importance_sampling_ratio/max": 1.9537212610244752, "sampling/importance_sampling_ratio/mean": 0.9998884320259094, "sampling/importance_sampling_ratio/min": 0.316797736287117, "sampling/sampling_logp_difference/max": 1.2259002208709717, "sampling/sampling_logp_difference/mean": 0.013919955492019654, "step": 3465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1587.4, "completions/max_terminated_length": 1587.4, "completions/mean_length": 1218.278125, "completions/mean_terminated_length": 1218.278125, "completions/min_length": 963.0, "completions/min_terminated_length": 963.0, "entropy": 0.2593968540430069, "epoch": 4.077555816686251, "frac_reward_zero_std": 0.65, "grad_norm": 0.521091103553772, "learning_rate": 9.522655682093531e-08, "loss": -0.0016, "num_tokens": 474963797.0, "reward": 0.8380208492279053, "reward_std": 0.08670372664928436, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8380208492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.2724744647741318, "sampling/importance_sampling_ratio/max": 1.9798192977905273, "sampling/importance_sampling_ratio/mean": 1.000128412246704, "sampling/importance_sampling_ratio/min": 0.38998249769210813, "sampling/sampling_logp_difference/max": 0.9741350650787354, "sampling/sampling_logp_difference/mean": 0.013235159032046796, "step": 3470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 1724.2, "completions/max_terminated_length": 1707.4, "completions/mean_length": 1228.634375, "completions/mean_terminated_length": 1221.1024169921875, "completions/min_length": 919.8, "completions/min_terminated_length": 919.8, "entropy": 0.2747196197509766, "epoch": 4.083431257344301, "frac_reward_zero_std": 0.75, "grad_norm": 0.4292539060115814, "learning_rate": 9.462078992003876e-08, "loss": -0.0207, "num_tokens": 475676008.0, "reward": 0.83125, "reward_std": 0.06916316822171212, "rewards/e2e_recall_precision_mixed_reward/mean": 0.831250011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.2779860496520996, "sampling/importance_sampling_ratio/max": 1.9322911977767945, "sampling/importance_sampling_ratio/mean": 1.0001016497611999, "sampling/importance_sampling_ratio/min": 0.3249115705490112, "sampling/sampling_logp_difference/max": 1.2032855987548827, "sampling/sampling_logp_difference/mean": 0.013829777017235756, "step": 3475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1812.8, "completions/max_terminated_length": 1812.8, "completions/mean_length": 1300.915625, "completions/mean_terminated_length": 1300.915625, "completions/min_length": 1010.0, "completions/min_terminated_length": 1010.0, "entropy": 0.2737528860569, "epoch": 4.08930669800235, "frac_reward_zero_std": 0.55, "grad_norm": 0.8069294095039368, "learning_rate": 9.401502301914223e-08, "loss": 0.0012, "num_tokens": 476414573.0, "reward": 0.7744791626930236, "reward_std": 0.10844443291425705, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7744791746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.3621998965740204, "sampling/importance_sampling_ratio/max": 1.9878645896911622, "sampling/importance_sampling_ratio/mean": 1.0000685095787047, "sampling/importance_sampling_ratio/min": 0.3133451998233795, "sampling/sampling_logp_difference/max": 1.2502954721450805, "sampling/sampling_logp_difference/mean": 0.013823360577225685, "step": 3480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1492.4, "completions/max_terminated_length": 1492.4, "completions/mean_length": 1175.41875, "completions/mean_terminated_length": 1175.41875, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "entropy": 0.2621743202209473, "epoch": 4.0951821386604, "frac_reward_zero_std": 0.75, "grad_norm": 0.0, "learning_rate": 9.34092561182457e-08, "loss": -0.0049, "num_tokens": 477100547.0, "reward": 0.83203125, "reward_std": 0.034580792486667636, "rewards/e2e_recall_precision_mixed_reward/mean": 0.83203125, "rewards/e2e_recall_precision_mixed_reward/std": 0.2857617437839508, "sampling/importance_sampling_ratio/max": 1.9388158798217774, "sampling/importance_sampling_ratio/mean": 0.9999837040901184, "sampling/importance_sampling_ratio/min": 0.41814273595809937, "sampling/sampling_logp_difference/max": 1.050565242767334, "sampling/sampling_logp_difference/mean": 0.013383341580629348, "step": 3485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375, "completions/max_length": 1808.8, "completions/max_terminated_length": 1793.2, "completions/mean_length": 1210.359375, "completions/mean_terminated_length": 1200.5919921875, "completions/min_length": 831.4, "completions/min_terminated_length": 831.4, "entropy": 0.25503125190734866, "epoch": 4.101057579318449, "frac_reward_zero_std": 0.5, "grad_norm": 0.0, "learning_rate": 9.280348921734917e-08, "loss": -0.0018, "num_tokens": 477827738.0, "reward": 0.7958333492279053, "reward_std": 0.0934183917939663, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7958333492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.26625967025756836, "sampling/importance_sampling_ratio/max": 1.9422680139541626, "sampling/importance_sampling_ratio/mean": 1.0000776171684265, "sampling/importance_sampling_ratio/min": 0.2969828426837921, "sampling/sampling_logp_difference/max": 1.300187087059021, "sampling/sampling_logp_difference/mean": 0.013157267309725284, "step": 3490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1592.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 1163.596875, "completions/mean_terminated_length": 1163.596875, "completions/min_length": 825.2, "completions/min_terminated_length": 825.2, "entropy": 0.25895902812480925, "epoch": 4.106933019976498, "frac_reward_zero_std": 0.65, "grad_norm": 0.5002437233924866, "learning_rate": 9.219772231645262e-08, "loss": -0.001, "num_tokens": 478509497.0, "reward": 0.7967187523841858, "reward_std": 0.06796298734843731, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7967187523841858, "rewards/e2e_recall_precision_mixed_reward/std": 0.2967051357030869, "sampling/importance_sampling_ratio/max": 1.9166396617889405, "sampling/importance_sampling_ratio/mean": 0.9999278783798218, "sampling/importance_sampling_ratio/min": 0.33549955785274505, "sampling/sampling_logp_difference/max": 1.154195499420166, "sampling/sampling_logp_difference/mean": 0.013070161268115044, "step": 3495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1446.2, "completions/max_terminated_length": 1446.2, "completions/mean_length": 1097.896875, "completions/mean_terminated_length": 1097.896875, "completions/min_length": 811.6, "completions/min_terminated_length": 811.6, "entropy": 0.26062892377376556, "epoch": 4.112808460634548, "frac_reward_zero_std": 0.7, "grad_norm": 0.6695868372917175, "learning_rate": 9.159195541555608e-08, "loss": 0.003, "num_tokens": 479184440.0, "reward": 0.79411461353302, "reward_std": 0.06309830695390702, "rewards/e2e_recall_precision_mixed_reward/mean": 0.79411461353302, "rewards/e2e_recall_precision_mixed_reward/std": 0.2844283878803253, "sampling/importance_sampling_ratio/max": 1.9598466873168945, "sampling/importance_sampling_ratio/mean": 1.0000836849212646, "sampling/importance_sampling_ratio/min": 0.33525398969650266, "sampling/sampling_logp_difference/max": 1.1045416712760925, "sampling/sampling_logp_difference/mean": 0.013297425210475921, "step": 3500 }, { "epoch": 4.112808460634548, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1565.68, "eval_completions/max_terminated_length": 1565.68, "eval_completions/mean_length": 1154.865, "eval_completions/mean_terminated_length": 1154.865, "eval_completions/min_length": 871.24, "eval_completions/min_terminated_length": 871.24, "eval_entropy": 0.2613798928260803, "eval_frac_reward_zero_std": 0.61, "eval_loss": 0.0025717311073094606, "eval_num_tokens": 479184440.0, "eval_reward": 0.7733750081062317, "eval_reward_std": 0.07942094504833222, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7733750081062317, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2978187209367752, "eval_runtime": 429.2546, "eval_samples_per_second": 0.233, "eval_sampling/importance_sampling_ratio/max": 1.929530372619629, "eval_sampling/importance_sampling_ratio/mean": 0.999969162940979, "eval_sampling/importance_sampling_ratio/min": 0.3072571662068367, "eval_sampling/sampling_logp_difference/max": 1.3240701341629029, "eval_sampling/sampling_logp_difference/mean": 0.013397705145180225, "eval_steps_per_second": 0.005, "step": 3500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1559.0, "completions/max_terminated_length": 1559.0, "completions/mean_length": 1148.940625, "completions/mean_terminated_length": 1148.940625, "completions/min_length": 830.6, "completions/min_terminated_length": 830.6, "entropy": 0.25160637497901917, "epoch": 4.118683901292597, "frac_reward_zero_std": 0.55, "grad_norm": 0.6452022790908813, "learning_rate": 9.098618851465956e-08, "loss": -0.0045, "num_tokens": 479876965.0, "reward": 0.8864583492279052, "reward_std": 0.07887421548366547, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8864583492279052, "rewards/e2e_recall_precision_mixed_reward/std": 0.19640893638134002, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000006639957428, "sampling/importance_sampling_ratio/min": 0.3257301330566406, "sampling/sampling_logp_difference/max": 1.2440511465072632, "sampling/sampling_logp_difference/mean": 0.013004663959145546, "step": 3505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1498.8, "completions/max_terminated_length": 1498.8, "completions/mean_length": 1141.73125, "completions/mean_terminated_length": 1141.73125, "completions/min_length": 901.0, "completions/min_terminated_length": 901.0, "entropy": 0.26218015551567075, "epoch": 4.124559341950646, "frac_reward_zero_std": 0.6, "grad_norm": 0.45590612292289734, "learning_rate": 9.038042161376302e-08, "loss": 0.0045, "num_tokens": 480554591.0, "reward": 0.8541666746139527, "reward_std": 0.077769835293293, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8541666746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.28215756416320803, "sampling/importance_sampling_ratio/max": 1.9558951377868652, "sampling/importance_sampling_ratio/mean": 1.0000158309936524, "sampling/importance_sampling_ratio/min": 0.33649215698242185, "sampling/sampling_logp_difference/max": 1.1709180116653441, "sampling/sampling_logp_difference/mean": 0.013481209240853786, "step": 3510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1506.4, "completions/max_terminated_length": 1506.4, "completions/mean_length": 1168.775, "completions/mean_terminated_length": 1168.775, "completions/min_length": 920.2, "completions/min_terminated_length": 920.2, "entropy": 0.26049660742282865, "epoch": 4.130434782608695, "frac_reward_zero_std": 0.6, "grad_norm": 0.6427649259567261, "learning_rate": 8.977465471286649e-08, "loss": -0.0023, "num_tokens": 481269495.0, "reward": 0.686718761920929, "reward_std": 0.06895458400249481, "rewards/e2e_recall_precision_mixed_reward/mean": 0.686718761920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.3957706391811371, "sampling/importance_sampling_ratio/max": 1.9636125326156617, "sampling/importance_sampling_ratio/mean": 1.0000577330589295, "sampling/importance_sampling_ratio/min": 0.28597378432750703, "sampling/sampling_logp_difference/max": 1.266392183303833, "sampling/sampling_logp_difference/mean": 0.013294227421283722, "step": 3515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1519.8, "completions/max_terminated_length": 1519.8, "completions/mean_length": 1163.378125, "completions/mean_terminated_length": 1163.378125, "completions/min_length": 854.4, "completions/min_terminated_length": 854.4, "entropy": 0.24826107621192933, "epoch": 4.136310223266745, "frac_reward_zero_std": 0.65, "grad_norm": 0.4001827836036682, "learning_rate": 8.916888781196995e-08, "loss": -0.0018, "num_tokens": 481950032.0, "reward": 0.8513020873069763, "reward_std": 0.0650397665798664, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8513020873069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.19886459708213805, "sampling/importance_sampling_ratio/max": 1.950411081314087, "sampling/importance_sampling_ratio/mean": 1.0000142931938172, "sampling/importance_sampling_ratio/min": 0.2998376667499542, "sampling/sampling_logp_difference/max": 1.3086572647094727, "sampling/sampling_logp_difference/mean": 0.012804117053747177, "step": 3520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1745.4, "completions/max_terminated_length": 1745.4, "completions/mean_length": 1194.35625, "completions/mean_terminated_length": 1194.35625, "completions/min_length": 909.2, "completions/min_terminated_length": 909.2, "entropy": 0.2595696121454239, "epoch": 4.142185663924795, "frac_reward_zero_std": 0.7, "grad_norm": 0.41437503695487976, "learning_rate": 8.856312091107341e-08, "loss": -0.0007, "num_tokens": 482659890.0, "reward": 0.8723958492279053, "reward_std": 0.04596400782465935, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8723958492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.2494454562664032, "sampling/importance_sampling_ratio/max": 1.95485520362854, "sampling/importance_sampling_ratio/mean": 0.999992847442627, "sampling/importance_sampling_ratio/min": 0.40674508810043336, "sampling/sampling_logp_difference/max": 0.9346871614456177, "sampling/sampling_logp_difference/mean": 0.013200496323406696, "step": 3525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1726.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 1188.909375, "completions/mean_terminated_length": 1188.909375, "completions/min_length": 874.4, "completions/min_terminated_length": 874.4, "entropy": 0.2728053092956543, "epoch": 4.148061104582844, "frac_reward_zero_std": 0.55, "grad_norm": 0.7083518505096436, "learning_rate": 8.795735401017689e-08, "loss": 0.0038, "num_tokens": 483374773.0, "reward": 0.8651041865348816, "reward_std": 0.07273668944835662, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8651041865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.17529991567134856, "sampling/importance_sampling_ratio/max": 1.9552690982818604, "sampling/importance_sampling_ratio/mean": 0.9999983310699463, "sampling/importance_sampling_ratio/min": 0.3792727530002594, "sampling/sampling_logp_difference/max": 0.9897878289222717, "sampling/sampling_logp_difference/mean": 0.013795130141079425, "step": 3530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1658.8, "completions/max_terminated_length": 1658.8, "completions/mean_length": 1235.453125, "completions/mean_terminated_length": 1235.453125, "completions/min_length": 992.8, "completions/min_terminated_length": 992.8, "entropy": 0.27267765402793886, "epoch": 4.153936545240893, "frac_reward_zero_std": 0.65, "grad_norm": 0.3547857999801636, "learning_rate": 8.735158710928034e-08, "loss": -0.0013, "num_tokens": 484084614.0, "reward": 0.8630208611488343, "reward_std": 0.05130138620734215, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8630208611488343, "rewards/e2e_recall_precision_mixed_reward/std": 0.20890597105026246, "sampling/importance_sampling_ratio/max": 1.8556790113449098, "sampling/importance_sampling_ratio/mean": 0.9998457551002502, "sampling/importance_sampling_ratio/min": 0.39961166977882384, "sampling/sampling_logp_difference/max": 0.9650723934173584, "sampling/sampling_logp_difference/mean": 0.013754782639443875, "step": 3535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1705.2, "completions/max_terminated_length": 1705.2, "completions/mean_length": 1189.584375, "completions/mean_terminated_length": 1189.584375, "completions/min_length": 834.8, "completions/min_terminated_length": 834.8, "entropy": 0.2683590054512024, "epoch": 4.159811985898942, "frac_reward_zero_std": 0.6, "grad_norm": 0.7073742151260376, "learning_rate": 8.674582020838381e-08, "loss": 0.009, "num_tokens": 484785729.0, "reward": 0.9041666746139526, "reward_std": 0.06466917842626571, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9041666746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.16571083962917327, "sampling/importance_sampling_ratio/max": 1.9675647020339966, "sampling/importance_sampling_ratio/mean": 1.0000088930130004, "sampling/importance_sampling_ratio/min": 0.28218771507963536, "sampling/sampling_logp_difference/max": 2.2028767108917235, "sampling/sampling_logp_difference/mean": 0.013767124712467193, "step": 3540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1502.6, "completions/max_terminated_length": 1502.6, "completions/mean_length": 1172.46875, "completions/mean_terminated_length": 1172.46875, "completions/min_length": 920.4, "completions/min_terminated_length": 920.4, "entropy": 0.24803606569766998, "epoch": 4.165687426556992, "frac_reward_zero_std": 0.6, "grad_norm": 0.0, "learning_rate": 8.614005330748728e-08, "loss": -0.0028, "num_tokens": 485459031.0, "reward": 0.9213541865348815, "reward_std": 0.07192991301417351, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9213541865348815, "rewards/e2e_recall_precision_mixed_reward/std": 0.12924561873078347, "sampling/importance_sampling_ratio/max": 1.9395395040512085, "sampling/importance_sampling_ratio/mean": 0.9998730182647705, "sampling/importance_sampling_ratio/min": 0.41810473799705505, "sampling/sampling_logp_difference/max": 1.0929707050323487, "sampling/sampling_logp_difference/mean": 0.012722009792923927, "step": 3545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1635.4, "completions/max_terminated_length": 1635.4, "completions/mean_length": 1199.584375, "completions/mean_terminated_length": 1199.584375, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "entropy": 0.27304658889770506, "epoch": 4.171562867215041, "frac_reward_zero_std": 0.6, "grad_norm": 0.6417668461799622, "learning_rate": 8.553428640659074e-08, "loss": 0.0015, "num_tokens": 486185538.0, "reward": 0.8311979174613953, "reward_std": 0.09750491976737977, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8311979174613953, "rewards/e2e_recall_precision_mixed_reward/std": 0.30009780526161195, "sampling/importance_sampling_ratio/max": 1.9971216201782227, "sampling/importance_sampling_ratio/mean": 0.9999606013298035, "sampling/importance_sampling_ratio/min": 0.3398262977600098, "sampling/sampling_logp_difference/max": 1.1382944583892822, "sampling/sampling_logp_difference/mean": 0.014073985256254673, "step": 3550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1537.2, "completions/max_terminated_length": 1537.2, "completions/mean_length": 1147.121875, "completions/mean_terminated_length": 1147.121875, "completions/min_length": 831.2, "completions/min_terminated_length": 831.2, "entropy": 0.2567810148000717, "epoch": 4.17743830787309, "frac_reward_zero_std": 0.6, "grad_norm": 0.4355989694595337, "learning_rate": 8.49285195056942e-08, "loss": 0.0033, "num_tokens": 486886841.0, "reward": 0.9223958373069763, "reward_std": 0.07777083888649941, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9223958373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.16122473627328873, "sampling/importance_sampling_ratio/max": 1.9888453960418702, "sampling/importance_sampling_ratio/mean": 0.9999064683914185, "sampling/importance_sampling_ratio/min": 0.31339283287525177, "sampling/sampling_logp_difference/max": 1.2887135982513427, "sampling/sampling_logp_difference/mean": 0.013416317850351333, "step": 3555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1468.4, "completions/max_terminated_length": 1468.4, "completions/mean_length": 1161.878125, "completions/mean_terminated_length": 1161.878125, "completions/min_length": 911.8, "completions/min_terminated_length": 911.8, "entropy": 0.2630442798137665, "epoch": 4.18331374853114, "frac_reward_zero_std": 0.6, "grad_norm": 0.8106993436813354, "learning_rate": 8.432275260479766e-08, "loss": 0.0069, "num_tokens": 487599474.0, "reward": 0.8911458492279053, "reward_std": 0.09857227653265, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8911458492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.18993451595306396, "sampling/importance_sampling_ratio/max": 1.9875691890716554, "sampling/importance_sampling_ratio/mean": 1.0000635027885436, "sampling/importance_sampling_ratio/min": 0.3256483495235443, "sampling/sampling_logp_difference/max": 1.2611011028289796, "sampling/sampling_logp_difference/mean": 0.013434172235429287, "step": 3560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1660.2, "completions/max_terminated_length": 1660.2, "completions/mean_length": 1159.6125, "completions/mean_terminated_length": 1159.6125, "completions/min_length": 869.2, "completions/min_terminated_length": 869.2, "entropy": 0.257218137383461, "epoch": 4.1891891891891895, "frac_reward_zero_std": 0.85, "grad_norm": 0.4585232436656952, "learning_rate": 8.371698570390114e-08, "loss": 0.002, "num_tokens": 488298678.0, "reward": 0.8380208373069763, "reward_std": 0.02569769471883774, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8380208373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.23604546785354613, "sampling/importance_sampling_ratio/max": 1.859304928779602, "sampling/importance_sampling_ratio/mean": 1.000068199634552, "sampling/importance_sampling_ratio/min": 0.3907679319381714, "sampling/sampling_logp_difference/max": 1.0408684730529785, "sampling/sampling_logp_difference/mean": 0.013181830570101739, "step": 3565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1548.4, "completions/max_terminated_length": 1548.4, "completions/mean_length": 1177.284375, "completions/mean_terminated_length": 1177.284375, "completions/min_length": 862.0, "completions/min_terminated_length": 862.0, "entropy": 0.2626467883586884, "epoch": 4.195064629847239, "frac_reward_zero_std": 0.55, "grad_norm": 0.0, "learning_rate": 8.31112188030046e-08, "loss": 0.0047, "num_tokens": 488978465.0, "reward": 0.8786458373069763, "reward_std": 0.09802740439772606, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8786458373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.2111774556338787, "sampling/importance_sampling_ratio/max": 1.8543755054473876, "sampling/importance_sampling_ratio/mean": 0.999963641166687, "sampling/importance_sampling_ratio/min": 0.4706719875335693, "sampling/sampling_logp_difference/max": 0.8755661010742187, "sampling/sampling_logp_difference/mean": 0.013379019685089588, "step": 3570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1575.2, "completions/max_terminated_length": 1575.2, "completions/mean_length": 1152.871875, "completions/mean_terminated_length": 1152.871875, "completions/min_length": 889.2, "completions/min_terminated_length": 889.2, "entropy": 0.26695799827575684, "epoch": 4.200940070505288, "frac_reward_zero_std": 0.75, "grad_norm": 0.4237537682056427, "learning_rate": 8.250545190210805e-08, "loss": 0.0024, "num_tokens": 489668328.0, "reward": 0.8729166746139526, "reward_std": 0.05412452816963196, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8729166746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.20219584852457045, "sampling/importance_sampling_ratio/max": 1.9795571327209474, "sampling/importance_sampling_ratio/mean": 1.0000197649002076, "sampling/importance_sampling_ratio/min": 0.3312128663063049, "sampling/sampling_logp_difference/max": 1.2004522442817689, "sampling/sampling_logp_difference/mean": 0.01357947289943695, "step": 3575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1690.4, "completions/max_terminated_length": 1690.4, "completions/mean_length": 1177.5125, "completions/mean_terminated_length": 1177.5125, "completions/min_length": 875.4, "completions/min_terminated_length": 875.4, "entropy": 0.25332061350345614, "epoch": 4.206815511163337, "frac_reward_zero_std": 0.6, "grad_norm": 0.6305985450744629, "learning_rate": 8.189968500121153e-08, "loss": 0.0012, "num_tokens": 490368364.0, "reward": 0.8390104293823242, "reward_std": 0.09466763809323311, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8390104293823242, "rewards/e2e_recall_precision_mixed_reward/std": 0.2736502140760422, "sampling/importance_sampling_ratio/max": 1.9991877317428588, "sampling/importance_sampling_ratio/mean": 0.9999727964401245, "sampling/importance_sampling_ratio/min": 0.32218090295791624, "sampling/sampling_logp_difference/max": 1.1511583924293518, "sampling/sampling_logp_difference/mean": 0.01315567884594202, "step": 3580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1541.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 1184.74375, "completions/mean_terminated_length": 1184.74375, "completions/min_length": 935.2, "completions/min_terminated_length": 935.2, "entropy": 0.24916426241397857, "epoch": 4.212690951821386, "frac_reward_zero_std": 0.65, "grad_norm": 0.37277522683143616, "learning_rate": 8.129391810031499e-08, "loss": 0.0012, "num_tokens": 491056298.0, "reward": 0.9276041746139526, "reward_std": 0.06645566001534461, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9276041746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.15502337217330933, "sampling/importance_sampling_ratio/max": 1.9811736583709716, "sampling/importance_sampling_ratio/mean": 1.000077986717224, "sampling/importance_sampling_ratio/min": 0.32719268798828127, "sampling/sampling_logp_difference/max": 1.180593204498291, "sampling/sampling_logp_difference/mean": 0.012911760807037353, "step": 3585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1603.0, "completions/max_terminated_length": 1603.0, "completions/mean_length": 1200.834375, "completions/mean_terminated_length": 1200.834375, "completions/min_length": 877.4, "completions/min_terminated_length": 877.4, "entropy": 0.2584104537963867, "epoch": 4.218566392479436, "frac_reward_zero_std": 0.6, "grad_norm": 0.5621435642242432, "learning_rate": 8.068815119941847e-08, "loss": 0.0027, "num_tokens": 491756277.0, "reward": 0.866406261920929, "reward_std": 0.07562874779105186, "rewards/e2e_recall_precision_mixed_reward/mean": 0.866406261920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.16670996248722075, "sampling/importance_sampling_ratio/max": 1.9320027589797975, "sampling/importance_sampling_ratio/mean": 1.000148606300354, "sampling/importance_sampling_ratio/min": 0.2562232553958893, "sampling/sampling_logp_difference/max": 1.4149065494537354, "sampling/sampling_logp_difference/mean": 0.01321282796561718, "step": 3590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1655.8, "completions/max_terminated_length": 1655.8, "completions/mean_length": 1167.93125, "completions/mean_terminated_length": 1167.93125, "completions/min_length": 866.0, "completions/min_terminated_length": 866.0, "entropy": 0.24021802842617035, "epoch": 4.224441833137485, "frac_reward_zero_std": 0.65, "grad_norm": 0.46955856680870056, "learning_rate": 8.008238429852192e-08, "loss": -0.0029, "num_tokens": 492476799.0, "reward": 0.7401562571525574, "reward_std": 0.06587436497211456, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7401562571525574, "rewards/e2e_recall_precision_mixed_reward/std": 0.2895350486040115, "sampling/importance_sampling_ratio/max": 1.9865763902664184, "sampling/importance_sampling_ratio/mean": 1.0000853419303894, "sampling/importance_sampling_ratio/min": 0.3269083648920059, "sampling/sampling_logp_difference/max": 1.1674981117248535, "sampling/sampling_logp_difference/mean": 0.01268553752452135, "step": 3595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1734.8, "completions/max_terminated_length": 1734.8, "completions/mean_length": 1220.34375, "completions/mean_terminated_length": 1220.34375, "completions/min_length": 886.4, "completions/min_terminated_length": 886.4, "entropy": 0.25528871417045595, "epoch": 4.230317273795535, "frac_reward_zero_std": 0.4, "grad_norm": 0.8392326235771179, "learning_rate": 7.947661739762538e-08, "loss": 0.0063, "num_tokens": 493210509.0, "reward": 0.7765625238418579, "reward_std": 0.11853159815073014, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7765625238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.2417447656393051, "sampling/importance_sampling_ratio/max": 1.9319365262985229, "sampling/importance_sampling_ratio/mean": 0.9999574422836304, "sampling/importance_sampling_ratio/min": 0.38794071674346925, "sampling/sampling_logp_difference/max": 1.0534749269485473, "sampling/sampling_logp_difference/mean": 0.01314362119883299, "step": 3600 }, { "epoch": 4.230317273795535, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1550.0, "eval_completions/max_terminated_length": 1550.0, "eval_completions/mean_length": 1137.6325, "eval_completions/mean_terminated_length": 1137.6325, "eval_completions/min_length": 865.32, "eval_completions/min_terminated_length": 865.32, "eval_entropy": 0.25953981578350066, "eval_frac_reward_zero_std": 0.63, "eval_loss": 0.0015070197405293584, "eval_num_tokens": 493210509.0, "eval_reward": 0.7727812600135803, "eval_reward_std": 0.07691708654165268, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7727812600135803, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.29146748542785644, "eval_runtime": 426.6542, "eval_samples_per_second": 0.234, "eval_sampling/importance_sampling_ratio/max": 1.9408579063415528, "eval_sampling/importance_sampling_ratio/mean": 1.0000195336341857, "eval_sampling/importance_sampling_ratio/min": 0.315677208006382, "eval_sampling/sampling_logp_difference/max": 1.3441522383689881, "eval_sampling/sampling_logp_difference/mean": 0.01335240513086319, "eval_steps_per_second": 0.005, "step": 3600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1525.4, "completions/max_terminated_length": 1525.4, "completions/mean_length": 1141.89375, "completions/mean_terminated_length": 1141.89375, "completions/min_length": 858.8, "completions/min_terminated_length": 858.8, "entropy": 0.2490395724773407, "epoch": 4.236192714453584, "frac_reward_zero_std": 0.65, "grad_norm": 0.6798803210258484, "learning_rate": 7.887085049672886e-08, "loss": 0.0002, "num_tokens": 493887211.0, "reward": 0.8026041746139526, "reward_std": 0.06693809628486633, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8026041746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.2717654824256897, "sampling/importance_sampling_ratio/max": 1.9659399032592773, "sampling/importance_sampling_ratio/mean": 1.000072717666626, "sampling/importance_sampling_ratio/min": 0.3628074645996094, "sampling/sampling_logp_difference/max": 1.1372278690338136, "sampling/sampling_logp_difference/mean": 0.012980341352522374, "step": 3605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1651.6, "completions/max_terminated_length": 1651.6, "completions/mean_length": 1181.6375, "completions/mean_terminated_length": 1181.6375, "completions/min_length": 887.4, "completions/min_terminated_length": 887.4, "entropy": 0.250617590546608, "epoch": 4.2420681551116335, "frac_reward_zero_std": 0.65, "grad_norm": 0.5288187861442566, "learning_rate": 7.826508359583232e-08, "loss": 0.0018, "num_tokens": 494578743.0, "reward": 0.8558333516120911, "reward_std": 0.08396902829408645, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8558333516120911, "rewards/e2e_recall_precision_mixed_reward/std": 0.1968195393681526, "sampling/importance_sampling_ratio/max": 1.8700719356536866, "sampling/importance_sampling_ratio/mean": 0.999861991405487, "sampling/importance_sampling_ratio/min": 0.3520336002111435, "sampling/sampling_logp_difference/max": 1.1078625679016114, "sampling/sampling_logp_difference/mean": 0.013078899681568145, "step": 3610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1537.2, "completions/max_terminated_length": 1537.2, "completions/mean_length": 1147.86875, "completions/mean_terminated_length": 1147.86875, "completions/min_length": 904.4, "completions/min_terminated_length": 904.4, "entropy": 0.25358888506889343, "epoch": 4.247943595769683, "frac_reward_zero_std": 0.6, "grad_norm": 0.7324259281158447, "learning_rate": 7.76593166949358e-08, "loss": -0.0012, "num_tokens": 495258845.0, "reward": 0.8223958373069763, "reward_std": 0.06487023383378983, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8223958373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.27769546806812284, "sampling/importance_sampling_ratio/max": 1.9683610439300536, "sampling/importance_sampling_ratio/mean": 0.9999585747718811, "sampling/importance_sampling_ratio/min": 0.36560204029083254, "sampling/sampling_logp_difference/max": 1.239591932296753, "sampling/sampling_logp_difference/mean": 0.01286852192133665, "step": 3615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1655.6, "completions/max_terminated_length": 1655.6, "completions/mean_length": 1202.1, "completions/mean_terminated_length": 1202.1, "completions/min_length": 888.2, "completions/min_terminated_length": 888.2, "entropy": 0.26086442470550536, "epoch": 4.253819036427732, "frac_reward_zero_std": 0.7, "grad_norm": 0.38992178440093994, "learning_rate": 7.705354979403925e-08, "loss": -0.0012, "num_tokens": 495961789.0, "reward": 0.8336458563804626, "reward_std": 0.06623862236738205, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8336458563804626, "rewards/e2e_recall_precision_mixed_reward/std": 0.25836754143238067, "sampling/importance_sampling_ratio/max": 1.9335944414138795, "sampling/importance_sampling_ratio/mean": 1.0000713109970092, "sampling/importance_sampling_ratio/min": 0.3970839321613312, "sampling/sampling_logp_difference/max": 0.9824123978614807, "sampling/sampling_logp_difference/mean": 0.013319226913154125, "step": 3620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1806.0, "completions/max_terminated_length": 1806.0, "completions/mean_length": 1218.45625, "completions/mean_terminated_length": 1218.45625, "completions/min_length": 847.2, "completions/min_terminated_length": 847.2, "entropy": 0.2762126445770264, "epoch": 4.259694477085781, "frac_reward_zero_std": 0.55, "grad_norm": 0.7492891550064087, "learning_rate": 7.644778289314271e-08, "loss": 0.001, "num_tokens": 496669871.0, "reward": 0.744531261920929, "reward_std": 0.11009465903043747, "rewards/e2e_recall_precision_mixed_reward/mean": 0.744531261920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.34771095514297484, "sampling/importance_sampling_ratio/max": 1.9903298139572143, "sampling/importance_sampling_ratio/mean": 1.0000926017761231, "sampling/importance_sampling_ratio/min": 0.3699575960636139, "sampling/sampling_logp_difference/max": 1.0518778800964355, "sampling/sampling_logp_difference/mean": 0.014002586342394352, "step": 3625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1431.6, "completions/max_terminated_length": 1431.6, "completions/mean_length": 1137.309375, "completions/mean_terminated_length": 1137.309375, "completions/min_length": 898.8, "completions/min_terminated_length": 898.8, "entropy": 0.23710068166255951, "epoch": 4.26556991774383, "frac_reward_zero_std": 0.65, "grad_norm": 1.1419117450714111, "learning_rate": 7.584201599224618e-08, "loss": 0.003, "num_tokens": 497322610.0, "reward": 0.9127604246139527, "reward_std": 0.08629101514816284, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9127604246139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.16239862740039826, "sampling/importance_sampling_ratio/max": 1.997536063194275, "sampling/importance_sampling_ratio/mean": 1.0000044703483582, "sampling/importance_sampling_ratio/min": 0.3823740020394325, "sampling/sampling_logp_difference/max": 1.238282561302185, "sampling/sampling_logp_difference/mean": 0.012295803800225259, "step": 3630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1417.4, "completions/max_terminated_length": 1417.4, "completions/mean_length": 1097.246875, "completions/mean_terminated_length": 1097.246875, "completions/min_length": 780.8, "completions/min_terminated_length": 780.8, "entropy": 0.24801380932331085, "epoch": 4.2714453584018806, "frac_reward_zero_std": 0.6, "grad_norm": 0.6945148706436157, "learning_rate": 7.523624909134965e-08, "loss": -0.002, "num_tokens": 497989201.0, "reward": 0.8451562523841858, "reward_std": 0.07065275609493256, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8451562523841858, "rewards/e2e_recall_precision_mixed_reward/std": 0.2394854724407196, "sampling/importance_sampling_ratio/max": 1.8666242361068726, "sampling/importance_sampling_ratio/mean": 1.0000623703002929, "sampling/importance_sampling_ratio/min": 0.357879763841629, "sampling/sampling_logp_difference/max": 1.413445281982422, "sampling/sampling_logp_difference/mean": 0.012836653739213943, "step": 3635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 1744.6, "completions/max_terminated_length": 1741.6, "completions/mean_length": 1198.328125, "completions/mean_terminated_length": 1182.7527099609374, "completions/min_length": 860.8, "completions/min_terminated_length": 860.8, "entropy": 0.2554967701435089, "epoch": 4.27732079905993, "frac_reward_zero_std": 0.7, "grad_norm": 0.0, "learning_rate": 7.463048219045311e-08, "loss": -0.009, "num_tokens": 498713450.0, "reward": 0.6994791865348816, "reward_std": 0.06559417322278023, "rewards/e2e_recall_precision_mixed_reward/mean": 0.6994791865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.3261187255382538, "sampling/importance_sampling_ratio/max": 1.915812611579895, "sampling/importance_sampling_ratio/mean": 1.0000962734222412, "sampling/importance_sampling_ratio/min": 0.37440991401672363, "sampling/sampling_logp_difference/max": 1.04357990026474, "sampling/sampling_logp_difference/mean": 0.013260076381266117, "step": 3640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1707.8, "completions/max_terminated_length": 1707.8, "completions/mean_length": 1165.36875, "completions/mean_terminated_length": 1165.36875, "completions/min_length": 819.6, "completions/min_terminated_length": 819.6, "entropy": 0.2641367554664612, "epoch": 4.283196239717979, "frac_reward_zero_std": 0.65, "grad_norm": 0.5784282684326172, "learning_rate": 7.402471528955657e-08, "loss": 0.004, "num_tokens": 499375088.0, "reward": 0.8260937690734863, "reward_std": 0.06488855034112931, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8260937690734863, "rewards/e2e_recall_precision_mixed_reward/std": 0.2198262929916382, "sampling/importance_sampling_ratio/max": 1.9456058263778686, "sampling/importance_sampling_ratio/mean": 1.0000360012054443, "sampling/importance_sampling_ratio/min": 0.42152239084243776, "sampling/sampling_logp_difference/max": 0.9165781736373901, "sampling/sampling_logp_difference/mean": 0.013488280028104782, "step": 3645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1782.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 1211.425, "completions/mean_terminated_length": 1211.425, "completions/min_length": 864.4, "completions/min_terminated_length": 864.4, "entropy": 0.261778736114502, "epoch": 4.289071680376028, "frac_reward_zero_std": 0.55, "grad_norm": 0.5044363141059875, "learning_rate": 7.341894838866005e-08, "loss": -0.0007, "num_tokens": 500075528.0, "reward": 0.8682291865348816, "reward_std": 0.10688243508338928, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8682291865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.25191566050052644, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001546621322632, "sampling/importance_sampling_ratio/min": 0.3008132725954056, "sampling/sampling_logp_difference/max": 1.3812680006027223, "sampling/sampling_logp_difference/mean": 0.013221434317529202, "step": 3650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1432.8, "completions/max_terminated_length": 1432.8, "completions/mean_length": 1110.65, "completions/mean_terminated_length": 1110.65, "completions/min_length": 852.6, "completions/min_terminated_length": 852.6, "entropy": 0.2591015428304672, "epoch": 4.2949471210340775, "frac_reward_zero_std": 0.65, "grad_norm": 0.6536259055137634, "learning_rate": 7.281318148776351e-08, "loss": 0.0039, "num_tokens": 500752504.0, "reward": 0.9790624976158142, "reward_std": 0.048888879269361495, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9790624976158142, "rewards/e2e_recall_precision_mixed_reward/std": 0.06051587462425232, "sampling/importance_sampling_ratio/max": 1.9475368022918702, "sampling/importance_sampling_ratio/mean": 1.0000488758087158, "sampling/importance_sampling_ratio/min": 0.32482802011072637, "sampling/sampling_logp_difference/max": 1.5002553701400756, "sampling/sampling_logp_difference/mean": 0.013194483146071434, "step": 3655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1588.0, "completions/max_terminated_length": 1588.0, "completions/mean_length": 1174.31875, "completions/mean_terminated_length": 1174.31875, "completions/min_length": 860.4, "completions/min_terminated_length": 860.4, "entropy": 0.24145943224430083, "epoch": 4.300822561692127, "frac_reward_zero_std": 0.65, "grad_norm": 0.3969491720199585, "learning_rate": 7.220741458686696e-08, "loss": 0.0052, "num_tokens": 501458510.0, "reward": 0.8920312762260437, "reward_std": 0.053078722581267355, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8920312881469726, "rewards/e2e_recall_precision_mixed_reward/std": 0.1960427224636078, "sampling/importance_sampling_ratio/max": 1.921673846244812, "sampling/importance_sampling_ratio/mean": 1.0000031232833861, "sampling/importance_sampling_ratio/min": 0.39288265705108644, "sampling/sampling_logp_difference/max": 1.1308459281921386, "sampling/sampling_logp_difference/mean": 0.012670677155256271, "step": 3660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1531.8, "completions/max_terminated_length": 1531.8, "completions/mean_length": 1174.265625, "completions/mean_terminated_length": 1174.265625, "completions/min_length": 933.4, "completions/min_terminated_length": 933.4, "entropy": 0.2668359637260437, "epoch": 4.306698002350176, "frac_reward_zero_std": 0.5, "grad_norm": 0.930717408657074, "learning_rate": 7.160164768597044e-08, "loss": 0.004, "num_tokens": 502164675.0, "reward": 0.8020833373069763, "reward_std": 0.09576954618096352, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8020833373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.2947723791003227, "sampling/importance_sampling_ratio/max": 1.946074938774109, "sampling/importance_sampling_ratio/mean": 1.000030016899109, "sampling/importance_sampling_ratio/min": 0.38857103884220123, "sampling/sampling_logp_difference/max": 1.1057616233825684, "sampling/sampling_logp_difference/mean": 0.01362884696573019, "step": 3665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1576.2, "completions/max_terminated_length": 1576.2, "completions/mean_length": 1180.81875, "completions/mean_terminated_length": 1180.81875, "completions/min_length": 872.8, "completions/min_terminated_length": 872.8, "entropy": 0.25552141666412354, "epoch": 4.312573443008225, "frac_reward_zero_std": 0.7, "grad_norm": 0.585945188999176, "learning_rate": 7.09958807850739e-08, "loss": -0.0017, "num_tokens": 502853849.0, "reward": 0.9364583373069764, "reward_std": 0.055623647570610044, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9364583373069764, "rewards/e2e_recall_precision_mixed_reward/std": 0.1332593083381653, "sampling/importance_sampling_ratio/max": 1.8885759353637694, "sampling/importance_sampling_ratio/mean": 1.000054121017456, "sampling/importance_sampling_ratio/min": 0.4013862669467926, "sampling/sampling_logp_difference/max": 0.9747562885284424, "sampling/sampling_logp_difference/mean": 0.01286784913390875, "step": 3670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1527.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 1141.078125, "completions/mean_terminated_length": 1141.078125, "completions/min_length": 849.0, "completions/min_terminated_length": 849.0, "entropy": 0.2603608280420303, "epoch": 4.318448883666275, "frac_reward_zero_std": 0.65, "grad_norm": 0.730247437953949, "learning_rate": 7.039011388417738e-08, "loss": -0.0032, "num_tokens": 503527554.0, "reward": 0.9072916865348816, "reward_std": 0.06838146299123764, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9072916865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.20818188190460205, "sampling/importance_sampling_ratio/max": 1.9301481246948242, "sampling/importance_sampling_ratio/mean": 1.000066590309143, "sampling/importance_sampling_ratio/min": 0.24810873121023178, "sampling/sampling_logp_difference/max": 1.7007533073425294, "sampling/sampling_logp_difference/mean": 0.013547290675342083, "step": 3675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1607.8, "completions/max_terminated_length": 1607.8, "completions/mean_length": 1213.71875, "completions/mean_terminated_length": 1213.71875, "completions/min_length": 994.8, "completions/min_terminated_length": 994.8, "entropy": 0.2658453106880188, "epoch": 4.324324324324325, "frac_reward_zero_std": 0.9, "grad_norm": 0.0, "learning_rate": 6.978434698328083e-08, "loss": -0.0005, "num_tokens": 504220120.0, "reward": 0.9796875, "reward_std": 0.016568987071514128, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9796875, "rewards/e2e_recall_precision_mixed_reward/std": 0.05093269646167755, "sampling/importance_sampling_ratio/max": 1.8813456058502198, "sampling/importance_sampling_ratio/mean": 1.0000309467315673, "sampling/importance_sampling_ratio/min": 0.4121032416820526, "sampling/sampling_logp_difference/max": 1.1532811164855956, "sampling/sampling_logp_difference/mean": 0.013305356167256832, "step": 3680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1571.8, "completions/max_terminated_length": 1571.8, "completions/mean_length": 1158.6625, "completions/mean_terminated_length": 1158.6625, "completions/min_length": 868.2, "completions/min_terminated_length": 868.2, "entropy": 0.25527799427509307, "epoch": 4.330199764982374, "frac_reward_zero_std": 0.65, "grad_norm": 0.37966060638427734, "learning_rate": 6.917858008238429e-08, "loss": 0.0045, "num_tokens": 504920364.0, "reward": 0.7671875059604645, "reward_std": 0.08376320600509643, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7671875059604645, "rewards/e2e_recall_precision_mixed_reward/std": 0.2696637690067291, "sampling/importance_sampling_ratio/max": 1.9269562005996703, "sampling/importance_sampling_ratio/mean": 1.0000017642974854, "sampling/importance_sampling_ratio/min": 0.2865142642069486, "sampling/sampling_logp_difference/max": 4.005220603942871, "sampling/sampling_logp_difference/mean": 0.012985915131866931, "step": 3685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1684.8, "completions/max_terminated_length": 1684.8, "completions/mean_length": 1224.925, "completions/mean_terminated_length": 1224.925, "completions/min_length": 923.4, "completions/min_terminated_length": 923.4, "entropy": 0.24377625286579133, "epoch": 4.336075205640423, "frac_reward_zero_std": 0.55, "grad_norm": 0.342869371175766, "learning_rate": 6.857281318148777e-08, "loss": -0.0065, "num_tokens": 505618484.0, "reward": 0.845104169845581, "reward_std": 0.0953491523861885, "rewards/e2e_recall_precision_mixed_reward/mean": 0.845104169845581, "rewards/e2e_recall_precision_mixed_reward/std": 0.23468240201473237, "sampling/importance_sampling_ratio/max": 1.9759195566177368, "sampling/importance_sampling_ratio/mean": 1.0000083088874816, "sampling/importance_sampling_ratio/min": 0.29937623292207716, "sampling/sampling_logp_difference/max": 1.3291741847991942, "sampling/sampling_logp_difference/mean": 0.012640192732214927, "step": 3690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1699.6, "completions/max_terminated_length": 1699.6, "completions/mean_length": 1237.003125, "completions/mean_terminated_length": 1237.003125, "completions/min_length": 942.0, "completions/min_terminated_length": 942.0, "entropy": 0.26305500864982606, "epoch": 4.341950646298472, "frac_reward_zero_std": 0.5, "grad_norm": 0.7630984783172607, "learning_rate": 6.796704628059123e-08, "loss": -0.001, "num_tokens": 506308533.0, "reward": 0.7562500238418579, "reward_std": 0.10581399351358414, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7562500238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.3192788898944855, "sampling/importance_sampling_ratio/max": 1.9842085123062134, "sampling/importance_sampling_ratio/mean": 0.9998934268951416, "sampling/importance_sampling_ratio/min": 0.4040143370628357, "sampling/sampling_logp_difference/max": 1.1104382038116456, "sampling/sampling_logp_difference/mean": 0.013264241628348827, "step": 3695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1701.8, "completions/max_terminated_length": 1701.8, "completions/mean_length": 1203.19375, "completions/mean_terminated_length": 1203.19375, "completions/min_length": 872.6, "completions/min_terminated_length": 872.6, "entropy": 0.2650981694459915, "epoch": 4.3478260869565215, "frac_reward_zero_std": 0.5, "grad_norm": 0.636228621006012, "learning_rate": 6.736127937969469e-08, "loss": 0.009, "num_tokens": 507004915.0, "reward": 0.870312511920929, "reward_std": 0.09012310206890106, "rewards/e2e_recall_precision_mixed_reward/mean": 0.870312511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.19685865193605423, "sampling/importance_sampling_ratio/max": 1.9754976272583007, "sampling/importance_sampling_ratio/mean": 0.9999840497970581, "sampling/importance_sampling_ratio/min": 0.3514511287212372, "sampling/sampling_logp_difference/max": 1.0524065256118775, "sampling/sampling_logp_difference/mean": 0.013504279032349586, "step": 3700 }, { "epoch": 4.3478260869565215, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1591.76, "eval_completions/max_terminated_length": 1591.76, "eval_completions/mean_length": 1160.758125, "eval_completions/mean_terminated_length": 1160.758125, "eval_completions/min_length": 884.4, "eval_completions/min_terminated_length": 884.4, "eval_entropy": 0.2645591682195663, "eval_frac_reward_zero_std": 0.6, "eval_loss": 0.0012973687844350934, "eval_num_tokens": 507004915.0, "eval_reward": 0.7682916808128357, "eval_reward_std": 0.0784290412068367, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7682916808128357, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2942594337463379, "eval_runtime": 436.9774, "eval_samples_per_second": 0.229, "eval_sampling/importance_sampling_ratio/max": 1.9532128953933716, "eval_sampling/importance_sampling_ratio/mean": 1.0000083541870117, "eval_sampling/importance_sampling_ratio/min": 0.3924976485967636, "eval_sampling/sampling_logp_difference/max": 1.033168478012085, "eval_sampling/sampling_logp_difference/mean": 0.013480137176811695, "eval_steps_per_second": 0.005, "step": 3700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1606.2, "completions/max_terminated_length": 1606.2, "completions/mean_length": 1227.14375, "completions/mean_terminated_length": 1227.14375, "completions/min_length": 927.6, "completions/min_terminated_length": 927.6, "entropy": 0.26960003972053526, "epoch": 4.353701527614571, "frac_reward_zero_std": 0.7, "grad_norm": 0.45175471901893616, "learning_rate": 6.675551247879815e-08, "loss": -0.0005, "num_tokens": 507726785.0, "reward": 0.8671875119209289, "reward_std": 0.05317651703953743, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8671875119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.232213281840086, "sampling/importance_sampling_ratio/max": 1.9744548320770263, "sampling/importance_sampling_ratio/mean": 0.999909496307373, "sampling/importance_sampling_ratio/min": 0.37495740652084353, "sampling/sampling_logp_difference/max": 1.0028222799301147, "sampling/sampling_logp_difference/mean": 0.013552324287593365, "step": 3705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1602.8, "completions/max_terminated_length": 1602.8, "completions/mean_length": 1243.771875, "completions/mean_terminated_length": 1243.771875, "completions/min_length": 975.8, "completions/min_terminated_length": 975.8, "entropy": 0.26458646953105924, "epoch": 4.35957696827262, "frac_reward_zero_std": 0.75, "grad_norm": 0.0, "learning_rate": 6.614974557790162e-08, "loss": -0.0037, "num_tokens": 508442760.0, "reward": 0.8958333492279053, "reward_std": 0.05056643486022949, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8958333492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.17827844023704528, "sampling/importance_sampling_ratio/max": 1.9635953903198242, "sampling/importance_sampling_ratio/mean": 1.0000174283981322, "sampling/importance_sampling_ratio/min": 0.3145808935165405, "sampling/sampling_logp_difference/max": 1.2495264530181884, "sampling/sampling_logp_difference/mean": 0.013446901552379131, "step": 3710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1497.2, "completions/max_terminated_length": 1497.2, "completions/mean_length": 1114.21875, "completions/mean_terminated_length": 1114.21875, "completions/min_length": 796.2, "completions/min_terminated_length": 796.2, "entropy": 0.24198752343654634, "epoch": 4.36545240893067, "frac_reward_zero_std": 0.55, "grad_norm": 0.5798568725585938, "learning_rate": 6.554397867700509e-08, "loss": 0.0015, "num_tokens": 509124622.0, "reward": 0.8671875119209289, "reward_std": 0.09523374810814858, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8671875119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.2366887092590332, "sampling/importance_sampling_ratio/max": 1.9578262090682983, "sampling/importance_sampling_ratio/mean": 0.9999819397926331, "sampling/importance_sampling_ratio/min": 0.35864190459251405, "sampling/sampling_logp_difference/max": 1.0817768573760986, "sampling/sampling_logp_difference/mean": 0.012588843144476414, "step": 3715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1620.8, "completions/max_terminated_length": 1620.8, "completions/mean_length": 1219.38125, "completions/mean_terminated_length": 1219.38125, "completions/min_length": 903.4, "completions/min_terminated_length": 903.4, "entropy": 0.25731444656848906, "epoch": 4.371327849588719, "frac_reward_zero_std": 0.6, "grad_norm": 0.0, "learning_rate": 6.493821177610854e-08, "loss": -0.0002, "num_tokens": 509831912.0, "reward": 0.8432291746139526, "reward_std": 0.06508480310440064, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8432291746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.22504698634147643, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000614643096923, "sampling/importance_sampling_ratio/min": 0.28552534580230715, "sampling/sampling_logp_difference/max": 1.281605863571167, "sampling/sampling_logp_difference/mean": 0.01319657452404499, "step": 3720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1544.8, "completions/max_terminated_length": 1544.8, "completions/mean_length": 1148.18125, "completions/mean_terminated_length": 1148.18125, "completions/min_length": 829.4, "completions/min_terminated_length": 829.4, "entropy": 0.2571199804544449, "epoch": 4.377203290246769, "frac_reward_zero_std": 0.7, "grad_norm": 0.48561370372772217, "learning_rate": 6.433244487521202e-08, "loss": -0.001, "num_tokens": 510500562.0, "reward": 0.9138020873069763, "reward_std": 0.05333668440580368, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9138020873069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.17001340240240098, "sampling/importance_sampling_ratio/max": 1.9414838314056397, "sampling/importance_sampling_ratio/mean": 1.0000270128250122, "sampling/importance_sampling_ratio/min": 0.41579994559288025, "sampling/sampling_logp_difference/max": 0.9531768798828125, "sampling/sampling_logp_difference/mean": 0.012977257929742336, "step": 3725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1919.6, "completions/max_terminated_length": 1919.6, "completions/mean_length": 1261.775, "completions/mean_terminated_length": 1261.775, "completions/min_length": 897.4, "completions/min_terminated_length": 897.4, "entropy": 0.2818825155496597, "epoch": 4.383078730904818, "frac_reward_zero_std": 0.75, "grad_norm": 0.5734153389930725, "learning_rate": 6.372667797431548e-08, "loss": -0.0026, "num_tokens": 511228778.0, "reward": 0.8471354365348815, "reward_std": 0.0608148567378521, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8471354365348815, "rewards/e2e_recall_precision_mixed_reward/std": 0.23871059119701385, "sampling/importance_sampling_ratio/max": 1.9525668382644654, "sampling/importance_sampling_ratio/mean": 1.0000926733016968, "sampling/importance_sampling_ratio/min": 0.37156358957290647, "sampling/sampling_logp_difference/max": 1.0653681755065918, "sampling/sampling_logp_difference/mean": 0.01405724585056305, "step": 3730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1603.4, "completions/max_terminated_length": 1603.4, "completions/mean_length": 1199.4875, "completions/mean_terminated_length": 1199.4875, "completions/min_length": 797.8, "completions/min_terminated_length": 797.8, "entropy": 0.2717233240604401, "epoch": 4.388954171562867, "frac_reward_zero_std": 0.65, "grad_norm": 0.44968000054359436, "learning_rate": 6.312091107341894e-08, "loss": 0.0036, "num_tokens": 511913702.0, "reward": 0.9526041746139526, "reward_std": 0.07319597527384758, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9526041746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.13536526411771774, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999484062194824, "sampling/importance_sampling_ratio/min": 0.24164845794462195, "sampling/sampling_logp_difference/max": 6.622607588768005, "sampling/sampling_logp_difference/mean": 0.01382777951657772, "step": 3735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1701.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 1223.628125, "completions/mean_terminated_length": 1223.628125, "completions/min_length": 923.8, "completions/min_terminated_length": 923.8, "entropy": 0.25062851011753084, "epoch": 4.394829612220916, "frac_reward_zero_std": 0.45, "grad_norm": 0.8033055663108826, "learning_rate": 6.251514417252241e-08, "loss": 0.0072, "num_tokens": 512615423.0, "reward": 0.8494791865348816, "reward_std": 0.11086350381374359, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8494791865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.1987324446439743, "sampling/importance_sampling_ratio/max": 1.931475830078125, "sampling/importance_sampling_ratio/mean": 1.0000649571418763, "sampling/importance_sampling_ratio/min": 0.30528209507465365, "sampling/sampling_logp_difference/max": 1.3587350845336914, "sampling/sampling_logp_difference/mean": 0.013007241114974023, "step": 3740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1593.2, "completions/max_terminated_length": 1593.2, "completions/mean_length": 1212.203125, "completions/mean_terminated_length": 1212.203125, "completions/min_length": 946.6, "completions/min_terminated_length": 946.6, "entropy": 0.24510794878005981, "epoch": 4.4007050528789655, "frac_reward_zero_std": 0.75, "grad_norm": 0.45203477144241333, "learning_rate": 6.190937727162587e-08, "loss": 0.0007, "num_tokens": 513293504.0, "reward": 0.9322916746139527, "reward_std": 0.053071030974388124, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9322916746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.16087620556354523, "sampling/importance_sampling_ratio/max": 1.987834596633911, "sampling/importance_sampling_ratio/mean": 0.9998769402503968, "sampling/importance_sampling_ratio/min": 0.33060061633586885, "sampling/sampling_logp_difference/max": 1.3648550748825072, "sampling/sampling_logp_difference/mean": 0.012442305870354175, "step": 3745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1676.0, "completions/max_terminated_length": 1676.0, "completions/mean_length": 1173.953125, "completions/mean_terminated_length": 1173.953125, "completions/min_length": 914.2, "completions/min_terminated_length": 914.2, "entropy": 0.2551372706890106, "epoch": 4.406580493537016, "frac_reward_zero_std": 0.5, "grad_norm": 0.4871174991130829, "learning_rate": 6.130361037072935e-08, "loss": -0.0072, "num_tokens": 513961489.0, "reward": 0.8958333492279053, "reward_std": 0.08676534816622734, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8958333492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.20122278928756715, "sampling/importance_sampling_ratio/max": 1.957563042640686, "sampling/importance_sampling_ratio/mean": 1.0001355171203614, "sampling/importance_sampling_ratio/min": 0.29272522777318954, "sampling/sampling_logp_difference/max": 1.3090601205825805, "sampling/sampling_logp_difference/mean": 0.013074627332389354, "step": 3750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 1176.628125, "completions/mean_terminated_length": 1176.628125, "completions/min_length": 876.6, "completions/min_terminated_length": 876.6, "entropy": 0.259324786067009, "epoch": 4.412455934195065, "frac_reward_zero_std": 0.6, "grad_norm": 0.6822929382324219, "learning_rate": 6.069784346983281e-08, "loss": 0.0024, "num_tokens": 514653978.0, "reward": 0.8907291769981385, "reward_std": 0.07878585755825043, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8907291769981385, "rewards/e2e_recall_precision_mixed_reward/std": 0.19454094469547273, "sampling/importance_sampling_ratio/max": 1.972559380531311, "sampling/importance_sampling_ratio/mean": 1.000023365020752, "sampling/importance_sampling_ratio/min": 0.19456153102219104, "sampling/sampling_logp_difference/max": 2.361952781677246, "sampling/sampling_logp_difference/mean": 0.01318011675029993, "step": 3755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1492.8, "completions/max_terminated_length": 1492.8, "completions/mean_length": 1159.703125, "completions/mean_terminated_length": 1159.703125, "completions/min_length": 855.6, "completions/min_terminated_length": 855.6, "entropy": 0.2610477864742279, "epoch": 4.418331374853114, "frac_reward_zero_std": 0.5, "grad_norm": 0.6041929125785828, "learning_rate": 6.009207656893627e-08, "loss": 0.0042, "num_tokens": 515357595.0, "reward": 0.7888021111488343, "reward_std": 0.09884281009435654, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7888021111488343, "rewards/e2e_recall_precision_mixed_reward/std": 0.31391807496547697, "sampling/importance_sampling_ratio/max": 1.9285221576690674, "sampling/importance_sampling_ratio/mean": 1.0000345706939697, "sampling/importance_sampling_ratio/min": 0.4189653038978577, "sampling/sampling_logp_difference/max": 0.9033971548080444, "sampling/sampling_logp_difference/mean": 0.013369284197688102, "step": 3760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1598.0, "completions/max_terminated_length": 1598.0, "completions/mean_length": 1193.55625, "completions/mean_terminated_length": 1193.55625, "completions/min_length": 891.8, "completions/min_terminated_length": 891.8, "entropy": 0.25302115380764006, "epoch": 4.424206815511163, "frac_reward_zero_std": 0.2, "grad_norm": 0.7945708632469177, "learning_rate": 5.9486309668039735e-08, "loss": 0.0011, "num_tokens": 516055325.0, "reward": 0.7338541746139526, "reward_std": 0.15235694646835327, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7338541746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.3311033874750137, "sampling/importance_sampling_ratio/max": 1.8949079275131226, "sampling/importance_sampling_ratio/mean": 0.9999711632728576, "sampling/importance_sampling_ratio/min": 0.3348624587059021, "sampling/sampling_logp_difference/max": 1.168335199356079, "sampling/sampling_logp_difference/mean": 0.01294925194233656, "step": 3765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1564.2, "completions/max_terminated_length": 1564.2, "completions/mean_length": 1203.6125, "completions/mean_terminated_length": 1203.6125, "completions/min_length": 921.6, "completions/min_terminated_length": 921.6, "entropy": 0.25160735845565796, "epoch": 4.430082256169213, "frac_reward_zero_std": 0.75, "grad_norm": 0.4195053279399872, "learning_rate": 5.8880542767143204e-08, "loss": -0.0016, "num_tokens": 516760993.0, "reward": 0.8256250143051147, "reward_std": 0.05763051435351372, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8256250023841858, "rewards/e2e_recall_precision_mixed_reward/std": 0.24419592916965485, "sampling/importance_sampling_ratio/max": 1.9390623331069947, "sampling/importance_sampling_ratio/mean": 1.0000794172286986, "sampling/importance_sampling_ratio/min": 0.40482619404792786, "sampling/sampling_logp_difference/max": 0.967167592048645, "sampling/sampling_logp_difference/mean": 0.01277566682547331, "step": 3770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1495.4, "completions/max_terminated_length": 1495.4, "completions/mean_length": 1146.275, "completions/mean_terminated_length": 1146.275, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "entropy": 0.2464199036359787, "epoch": 4.435957696827262, "frac_reward_zero_std": 0.65, "grad_norm": 0.4899135231971741, "learning_rate": 5.827477586624666e-08, "loss": 0.0013, "num_tokens": 517446233.0, "reward": 0.8338541746139526, "reward_std": 0.062453911453485486, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8338541746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.252014647424221, "sampling/importance_sampling_ratio/max": 1.9053905963897706, "sampling/importance_sampling_ratio/mean": 1.0000820279121398, "sampling/importance_sampling_ratio/min": 0.3842323422431946, "sampling/sampling_logp_difference/max": 0.9629977226257325, "sampling/sampling_logp_difference/mean": 0.01282045040279627, "step": 3775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1525.8, "completions/max_terminated_length": 1525.8, "completions/mean_length": 1221.778125, "completions/mean_terminated_length": 1221.778125, "completions/min_length": 925.4, "completions/min_terminated_length": 925.4, "entropy": 0.25722981095314024, "epoch": 4.441833137485311, "frac_reward_zero_std": 0.5, "grad_norm": 0.7961036562919617, "learning_rate": 5.766900896535013e-08, "loss": 0.0022, "num_tokens": 518156594.0, "reward": 0.7723958492279053, "reward_std": 0.09959375858306885, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7723958492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.22938498258590698, "sampling/importance_sampling_ratio/max": 1.9257365465164185, "sampling/importance_sampling_ratio/mean": 0.9999468803405762, "sampling/importance_sampling_ratio/min": 0.3357783600687981, "sampling/sampling_logp_difference/max": 1.324942374229431, "sampling/sampling_logp_difference/mean": 0.012965224497020245, "step": 3780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1477.6, "completions/max_terminated_length": 1477.6, "completions/mean_length": 1133.83125, "completions/mean_terminated_length": 1133.83125, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "entropy": 0.2477620154619217, "epoch": 4.447708578143361, "frac_reward_zero_std": 0.85, "grad_norm": 0.5965957045555115, "learning_rate": 5.706324206445359e-08, "loss": -0.0029, "num_tokens": 518821772.0, "reward": 0.917187511920929, "reward_std": 0.031526891887187956, "rewards/e2e_recall_precision_mixed_reward/mean": 0.917187511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.13625341057777404, "sampling/importance_sampling_ratio/max": 1.9273170948028564, "sampling/importance_sampling_ratio/mean": 1.0000198364257813, "sampling/importance_sampling_ratio/min": 0.31869285106658934, "sampling/sampling_logp_difference/max": 1.2890705108642577, "sampling/sampling_logp_difference/mean": 0.012815793789923192, "step": 3785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1529.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 1149.540625, "completions/mean_terminated_length": 1149.540625, "completions/min_length": 882.2, "completions/min_terminated_length": 882.2, "entropy": 0.2463329553604126, "epoch": 4.45358401880141, "frac_reward_zero_std": 0.7, "grad_norm": 0.0, "learning_rate": 5.645747516355706e-08, "loss": 0.0083, "num_tokens": 519511353.0, "reward": 0.9010416746139527, "reward_std": 0.0561327800154686, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9010416746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.17636755406856536, "sampling/importance_sampling_ratio/max": 1.998112392425537, "sampling/importance_sampling_ratio/mean": 0.9999524116516113, "sampling/importance_sampling_ratio/min": 0.26993586095049976, "sampling/sampling_logp_difference/max": 1.9990779876708984, "sampling/sampling_logp_difference/mean": 0.012809686362743378, "step": 3790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1756.6, "completions/max_terminated_length": 1756.6, "completions/mean_length": 1267.45625, "completions/mean_terminated_length": 1267.45625, "completions/min_length": 894.4, "completions/min_terminated_length": 894.4, "entropy": 0.26598574221134186, "epoch": 4.45945945945946, "frac_reward_zero_std": 0.55, "grad_norm": 0.4366239011287689, "learning_rate": 5.5851708262660525e-08, "loss": -0.004, "num_tokens": 520282155.0, "reward": 0.7437500119209289, "reward_std": 0.06342579536139965, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7437500119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.29738129377365113, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000070607662201, "sampling/importance_sampling_ratio/min": 0.20193365388549864, "sampling/sampling_logp_difference/max": 2.41158607006073, "sampling/sampling_logp_difference/mean": 0.013761808723211288, "step": 3795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1707.8, "completions/max_terminated_length": 1707.8, "completions/mean_length": 1245.2125, "completions/mean_terminated_length": 1245.2125, "completions/min_length": 898.0, "completions/min_terminated_length": 898.0, "entropy": 0.2679412066936493, "epoch": 4.465334900117509, "frac_reward_zero_std": 0.6, "grad_norm": 0.6861710548400879, "learning_rate": 5.524594136176399e-08, "loss": 0.002, "num_tokens": 521014463.0, "reward": 0.8529687523841858, "reward_std": 0.0945423498749733, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8529687523841858, "rewards/e2e_recall_precision_mixed_reward/std": 0.21668245792388915, "sampling/importance_sampling_ratio/max": 1.9160140991210937, "sampling/importance_sampling_ratio/mean": 1.0001230716705323, "sampling/importance_sampling_ratio/min": 0.3523849457502365, "sampling/sampling_logp_difference/max": 1.2402945041656495, "sampling/sampling_logp_difference/mean": 0.013412072695791722, "step": 3800 }, { "epoch": 4.465334900117509, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1611.96, "eval_completions/max_terminated_length": 1611.96, "eval_completions/mean_length": 1146.22375, "eval_completions/mean_terminated_length": 1146.22375, "eval_completions/min_length": 863.76, "eval_completions/min_terminated_length": 863.76, "eval_entropy": 0.262061927318573, "eval_frac_reward_zero_std": 0.63, "eval_loss": 0.0004021058266516775, "eval_num_tokens": 521014463.0, "eval_reward": 0.77735417842865, "eval_reward_std": 0.06804241336882115, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.77735417842865, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2873810449242592, "eval_runtime": 437.7798, "eval_samples_per_second": 0.228, "eval_sampling/importance_sampling_ratio/max": 1.9648003053665162, "eval_sampling/importance_sampling_ratio/mean": 0.9999938631057739, "eval_sampling/importance_sampling_ratio/min": 0.321398678869009, "eval_sampling/sampling_logp_difference/max": 1.3109819889068604, "eval_sampling/sampling_logp_difference/mean": 0.013326054327189923, "eval_steps_per_second": 0.005, "step": 3800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1800.6, "completions/max_terminated_length": 1800.6, "completions/mean_length": 1244.31875, "completions/mean_terminated_length": 1244.31875, "completions/min_length": 900.0, "completions/min_terminated_length": 900.0, "entropy": 0.2523723661899567, "epoch": 4.471210340775558, "frac_reward_zero_std": 0.4, "grad_norm": 0.6707119345664978, "learning_rate": 5.464017446086745e-08, "loss": 0.0054, "num_tokens": 521734005.0, "reward": 0.8927083492279053, "reward_std": 0.10553528666496277, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8927083492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.19338307827711104, "sampling/importance_sampling_ratio/max": 1.923611617088318, "sampling/importance_sampling_ratio/mean": 1.000162625312805, "sampling/importance_sampling_ratio/min": 0.27145159617066383, "sampling/sampling_logp_difference/max": 1.490595269203186, "sampling/sampling_logp_difference/mean": 0.012991057336330413, "step": 3805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1444.0, "completions/max_terminated_length": 1444.0, "completions/mean_length": 1088.00625, "completions/mean_terminated_length": 1088.00625, "completions/min_length": 807.0, "completions/min_terminated_length": 807.0, "entropy": 0.24044516384601594, "epoch": 4.477085781433607, "frac_reward_zero_std": 0.65, "grad_norm": 0.8339589834213257, "learning_rate": 5.403440755997092e-08, "loss": 0.0051, "num_tokens": 522375063.0, "reward": 0.8260416746139526, "reward_std": 0.06378234401345254, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8260416746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.2462353974580765, "sampling/importance_sampling_ratio/max": 1.935638427734375, "sampling/importance_sampling_ratio/mean": 0.9999543190002441, "sampling/importance_sampling_ratio/min": 0.3494003385305405, "sampling/sampling_logp_difference/max": 1.260498571395874, "sampling/sampling_logp_difference/mean": 0.012606218457221985, "step": 3810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1463.0, "completions/max_terminated_length": 1463.0, "completions/mean_length": 1126.371875, "completions/mean_terminated_length": 1126.371875, "completions/min_length": 837.0, "completions/min_terminated_length": 837.0, "entropy": 0.25203177332878113, "epoch": 4.482961222091657, "frac_reward_zero_std": 0.65, "grad_norm": 0.6170172691345215, "learning_rate": 5.3428640659074383e-08, "loss": 0.0062, "num_tokens": 523071006.0, "reward": 0.7230208396911622, "reward_std": 0.055364441871643064, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7230208396911622, "rewards/e2e_recall_precision_mixed_reward/std": 0.3580518037080765, "sampling/importance_sampling_ratio/max": 1.866661834716797, "sampling/importance_sampling_ratio/mean": 1.0000461101531983, "sampling/importance_sampling_ratio/min": 0.3747582271695137, "sampling/sampling_logp_difference/max": 1.2039991855621337, "sampling/sampling_logp_difference/mean": 0.01318561527878046, "step": 3815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 1722.6, "completions/max_terminated_length": 1710.8, "completions/mean_length": 1216.221875, "completions/mean_terminated_length": 1203.265234375, "completions/min_length": 849.6, "completions/min_terminated_length": 849.6, "entropy": 0.25681395530700685, "epoch": 4.488836662749706, "frac_reward_zero_std": 0.7, "grad_norm": 0.0, "learning_rate": 5.282287375817785e-08, "loss": -0.0054, "num_tokens": 523788389.0, "reward": 0.8151041746139527, "reward_std": 0.055162250995635986, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8151041746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.26677214801311494, "sampling/importance_sampling_ratio/max": 1.8846506595611572, "sampling/importance_sampling_ratio/mean": 0.9999499320983887, "sampling/importance_sampling_ratio/min": 0.24901417940855025, "sampling/sampling_logp_difference/max": 1.526833724975586, "sampling/sampling_logp_difference/mean": 0.013307036273181439, "step": 3820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1553.6, "completions/max_terminated_length": 1553.6, "completions/mean_length": 1177.953125, "completions/mean_terminated_length": 1177.953125, "completions/min_length": 902.4, "completions/min_terminated_length": 902.4, "entropy": 0.2598425537347794, "epoch": 4.494712103407756, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 5.221710685728132e-08, "loss": -0.0017, "num_tokens": 524476582.0, "reward": 0.8104166746139526, "reward_std": 0.03926374763250351, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8104166746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.24043528139591216, "sampling/importance_sampling_ratio/max": 1.9260545253753663, "sampling/importance_sampling_ratio/mean": 1.0000295519828797, "sampling/importance_sampling_ratio/min": 0.31765392124652864, "sampling/sampling_logp_difference/max": 1.1738762140274048, "sampling/sampling_logp_difference/mean": 0.012885869853198529, "step": 3825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1537.2, "completions/max_terminated_length": 1537.2, "completions/mean_length": 1190.83125, "completions/mean_terminated_length": 1190.83125, "completions/min_length": 940.0, "completions/min_terminated_length": 940.0, "entropy": 0.2721462845802307, "epoch": 4.500587544065805, "frac_reward_zero_std": 0.6, "grad_norm": 0.0, "learning_rate": 5.161133995638478e-08, "loss": -0.0019, "num_tokens": 525207200.0, "reward": 0.8614583492279053, "reward_std": 0.06717992275953293, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8614583492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.19824496507644654, "sampling/importance_sampling_ratio/max": 1.9055607557296752, "sampling/importance_sampling_ratio/mean": 0.9999924659729004, "sampling/importance_sampling_ratio/min": 0.30736014246940613, "sampling/sampling_logp_difference/max": 1.254793071746826, "sampling/sampling_logp_difference/mean": 0.013739836029708385, "step": 3830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1508.6, "completions/max_terminated_length": 1508.6, "completions/mean_length": 1160.54375, "completions/mean_terminated_length": 1160.54375, "completions/min_length": 906.4, "completions/min_terminated_length": 906.4, "entropy": 0.24823629558086396, "epoch": 4.506462984723854, "frac_reward_zero_std": 0.85, "grad_norm": 0.0, "learning_rate": 5.100557305548825e-08, "loss": -0.0014, "num_tokens": 525878430.0, "reward": 0.8552083373069763, "reward_std": 0.04184042811393738, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8552083373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.1641036868095398, "sampling/importance_sampling_ratio/max": 1.9360382080078125, "sampling/importance_sampling_ratio/mean": 1.0001091837882996, "sampling/importance_sampling_ratio/min": 0.435736221075058, "sampling/sampling_logp_difference/max": 0.8629006385803223, "sampling/sampling_logp_difference/mean": 0.01278580967336893, "step": 3835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1659.2, "completions/max_terminated_length": 1659.2, "completions/mean_length": 1220.75, "completions/mean_terminated_length": 1220.75, "completions/min_length": 908.2, "completions/min_terminated_length": 908.2, "entropy": 0.26801995635032655, "epoch": 4.512338425381904, "frac_reward_zero_std": 0.55, "grad_norm": 0.0, "learning_rate": 5.039980615459171e-08, "loss": 0.0028, "num_tokens": 526583710.0, "reward": 0.8401041746139526, "reward_std": 0.07929302006959915, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8401041746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.22597428858280183, "sampling/importance_sampling_ratio/max": 1.9008899450302124, "sampling/importance_sampling_ratio/mean": 0.9999897599220275, "sampling/importance_sampling_ratio/min": 0.36771447360515597, "sampling/sampling_logp_difference/max": 1.1116267204284669, "sampling/sampling_logp_difference/mean": 0.013435482792556287, "step": 3840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1499.2, "completions/max_terminated_length": 1499.2, "completions/mean_length": 1125.628125, "completions/mean_terminated_length": 1125.628125, "completions/min_length": 837.6, "completions/min_terminated_length": 837.6, "entropy": 0.24567932784557342, "epoch": 4.518213866039953, "frac_reward_zero_std": 0.75, "grad_norm": 0.3865836262702942, "learning_rate": 4.979403925369518e-08, "loss": -0.0024, "num_tokens": 527265495.0, "reward": 0.9781250119209289, "reward_std": 0.039643457531929015, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9781250119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.0770312175154686, "sampling/importance_sampling_ratio/max": 1.9671642780303955, "sampling/importance_sampling_ratio/mean": 0.9999995350837707, "sampling/importance_sampling_ratio/min": 0.29105359613895415, "sampling/sampling_logp_difference/max": 1.2808002710342408, "sampling/sampling_logp_difference/mean": 0.01274335365742445, "step": 3845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1580.4, "completions/max_terminated_length": 1580.4, "completions/mean_length": 1136.375, "completions/mean_terminated_length": 1136.375, "completions/min_length": 814.2, "completions/min_terminated_length": 814.2, "entropy": 0.2531812101602554, "epoch": 4.524089306698002, "frac_reward_zero_std": 0.8, "grad_norm": 0.0, "learning_rate": 4.918827235279864e-08, "loss": 0.0018, "num_tokens": 527932655.0, "reward": 0.8760416746139527, "reward_std": 0.03966931700706482, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8760416746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.2241850331425667, "sampling/importance_sampling_ratio/max": 1.9524729251861572, "sampling/importance_sampling_ratio/mean": 0.9998790860176087, "sampling/importance_sampling_ratio/min": 0.35629400610923767, "sampling/sampling_logp_difference/max": 1.0961596488952636, "sampling/sampling_logp_difference/mean": 0.012896392308175565, "step": 3850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1752.4, "completions/max_terminated_length": 1752.4, "completions/mean_length": 1215.825, "completions/mean_terminated_length": 1215.825, "completions/min_length": 906.6, "completions/min_terminated_length": 906.6, "entropy": 0.2722884476184845, "epoch": 4.529964747356051, "frac_reward_zero_std": 0.65, "grad_norm": 0.7329661846160889, "learning_rate": 4.8582505451902106e-08, "loss": 0.0008, "num_tokens": 528649975.0, "reward": 0.9338541746139526, "reward_std": 0.06870361119508743, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9338541746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.14670785665512084, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000779271125793, "sampling/importance_sampling_ratio/min": 0.3969003438949585, "sampling/sampling_logp_difference/max": 1.0312414169311523, "sampling/sampling_logp_difference/mean": 0.013571283221244812, "step": 3855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1433.0, "completions/max_terminated_length": 1433.0, "completions/mean_length": 1156.028125, "completions/mean_terminated_length": 1156.028125, "completions/min_length": 884.6, "completions/min_terminated_length": 884.6, "entropy": 0.25865795016288756, "epoch": 4.535840188014101, "frac_reward_zero_std": 0.7, "grad_norm": 0.42414742708206177, "learning_rate": 4.797673855100557e-08, "loss": -0.0011, "num_tokens": 529328304.0, "reward": 0.8830729484558105, "reward_std": 0.0488592691719532, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8830729484558105, "rewards/e2e_recall_precision_mixed_reward/std": 0.19041550755500794, "sampling/importance_sampling_ratio/max": 1.9730222463607787, "sampling/importance_sampling_ratio/mean": 1.0000560283660889, "sampling/importance_sampling_ratio/min": 0.45640029907226565, "sampling/sampling_logp_difference/max": 0.8613581180572509, "sampling/sampling_logp_difference/mean": 0.012920542247593403, "step": 3860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1729.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 1203.79375, "completions/mean_terminated_length": 1203.79375, "completions/min_length": 920.8, "completions/min_terminated_length": 920.8, "entropy": 0.25510145127773287, "epoch": 4.541715628672151, "frac_reward_zero_std": 0.8, "grad_norm": 0.8556481003761292, "learning_rate": 4.737097165010904e-08, "loss": -0.0001, "num_tokens": 530035038.0, "reward": 0.8848958373069763, "reward_std": 0.050856249034404756, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8848958373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.17340871691703796, "sampling/importance_sampling_ratio/max": 1.9588409900665282, "sampling/importance_sampling_ratio/mean": 0.9999428868293763, "sampling/importance_sampling_ratio/min": 0.3276062309741974, "sampling/sampling_logp_difference/max": 1.2199375867843627, "sampling/sampling_logp_difference/mean": 0.0131689066067338, "step": 3865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1652.8, "completions/max_terminated_length": 1652.8, "completions/mean_length": 1153.7875, "completions/mean_terminated_length": 1153.7875, "completions/min_length": 906.8, "completions/min_terminated_length": 906.8, "entropy": 0.2554138332605362, "epoch": 4.5475910693302, "frac_reward_zero_std": 0.65, "grad_norm": 0.6005891561508179, "learning_rate": 4.67652047492125e-08, "loss": 0.0004, "num_tokens": 530738186.0, "reward": 0.7763020992279053, "reward_std": 0.05372370630502701, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7763020992279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.28807159811258315, "sampling/importance_sampling_ratio/max": 1.9729528188705445, "sampling/importance_sampling_ratio/mean": 1.0000159978866576, "sampling/importance_sampling_ratio/min": 0.34084552526474, "sampling/sampling_logp_difference/max": 1.1215097904205322, "sampling/sampling_logp_difference/mean": 0.01334170252084732, "step": 3870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1798.8, "completions/max_terminated_length": 1686.6, "completions/mean_length": 1217.609375, "completions/mean_terminated_length": 1213.7258544921874, "completions/min_length": 912.2, "completions/min_terminated_length": 912.2, "entropy": 0.2649496465921402, "epoch": 4.553466509988249, "frac_reward_zero_std": 0.55, "grad_norm": 0.11833023279905319, "learning_rate": 4.615943784831597e-08, "loss": -0.0126, "num_tokens": 531457721.0, "reward": 0.8276041865348815, "reward_std": 0.09151726067066193, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8276041865348815, "rewards/e2e_recall_precision_mixed_reward/std": 0.24235431849956512, "sampling/importance_sampling_ratio/max": 1.9542683362960815, "sampling/importance_sampling_ratio/mean": 1.0000524044036865, "sampling/importance_sampling_ratio/min": 0.42125728726387024, "sampling/sampling_logp_difference/max": 0.8714985370635986, "sampling/sampling_logp_difference/mean": 0.013429945148527623, "step": 3875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1519.8, "completions/max_terminated_length": 1519.8, "completions/mean_length": 1170.1625, "completions/mean_terminated_length": 1170.1625, "completions/min_length": 875.0, "completions/min_terminated_length": 875.0, "entropy": 0.2552923381328583, "epoch": 4.559341950646298, "frac_reward_zero_std": 0.65, "grad_norm": 0.5462660193443298, "learning_rate": 4.555367094741943e-08, "loss": 0.0003, "num_tokens": 532149821.0, "reward": 0.8942708373069763, "reward_std": 0.05812600329518318, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8942708373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.1705754280090332, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000134825706481, "sampling/importance_sampling_ratio/min": 0.3502050653100014, "sampling/sampling_logp_difference/max": 1.247010850906372, "sampling/sampling_logp_difference/mean": 0.012968187779188156, "step": 3880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1731.6, "completions/max_terminated_length": 1731.6, "completions/mean_length": 1178.2, "completions/mean_terminated_length": 1178.2, "completions/min_length": 800.0, "completions/min_terminated_length": 800.0, "entropy": 0.2585215061903, "epoch": 4.565217391304348, "frac_reward_zero_std": 0.65, "grad_norm": 0.5714375376701355, "learning_rate": 4.4947904046522897e-08, "loss": -0.0019, "num_tokens": 532852845.0, "reward": 0.8404687643051147, "reward_std": 0.06901195198297501, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8404687643051147, "rewards/e2e_recall_precision_mixed_reward/std": 0.24585144221782684, "sampling/importance_sampling_ratio/max": 1.974919819831848, "sampling/importance_sampling_ratio/mean": 0.999848234653473, "sampling/importance_sampling_ratio/min": 0.2790116786956787, "sampling/sampling_logp_difference/max": 1.7504661321640014, "sampling/sampling_logp_difference/mean": 0.013239697739481925, "step": 3885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1500.2, "completions/max_terminated_length": 1500.2, "completions/mean_length": 1135.875, "completions/mean_terminated_length": 1135.875, "completions/min_length": 773.6, "completions/min_terminated_length": 773.6, "entropy": 0.2818588376045227, "epoch": 4.571092831962397, "frac_reward_zero_std": 0.8, "grad_norm": 0.4970671236515045, "learning_rate": 4.434213714562636e-08, "loss": -0.0024, "num_tokens": 533555349.0, "reward": 0.8330729246139527, "reward_std": 0.038709495961666104, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8330729246139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.30427423417568206, "sampling/importance_sampling_ratio/max": 1.9636075735092162, "sampling/importance_sampling_ratio/mean": 1.0000600218772888, "sampling/importance_sampling_ratio/min": 0.4317789852619171, "sampling/sampling_logp_difference/max": 0.952790904045105, "sampling/sampling_logp_difference/mean": 0.014102857001125813, "step": 3890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1470.2, "completions/max_terminated_length": 1470.2, "completions/mean_length": 1135.459375, "completions/mean_terminated_length": 1135.459375, "completions/min_length": 813.0, "completions/min_terminated_length": 813.0, "entropy": 0.2717915177345276, "epoch": 4.576968272620446, "frac_reward_zero_std": 0.8, "grad_norm": 0.37997737526893616, "learning_rate": 4.373637024472983e-08, "loss": -0.0001, "num_tokens": 534205528.0, "reward": 0.9541666746139527, "reward_std": 0.03333333283662796, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9541666746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.09064806997776031, "sampling/importance_sampling_ratio/max": 1.943871831893921, "sampling/importance_sampling_ratio/mean": 0.9999991655349731, "sampling/importance_sampling_ratio/min": 0.33060679733753207, "sampling/sampling_logp_difference/max": 1.4704230308532715, "sampling/sampling_logp_difference/mean": 0.01357163693755865, "step": 3895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1652.8, "completions/max_terminated_length": 1652.8, "completions/mean_length": 1220.6125, "completions/mean_terminated_length": 1220.6125, "completions/min_length": 940.2, "completions/min_terminated_length": 940.2, "entropy": 0.26435089111328125, "epoch": 4.582843713278496, "frac_reward_zero_std": 0.7, "grad_norm": 0.9621942639350891, "learning_rate": 4.313060334383329e-08, "loss": 0.0041, "num_tokens": 534929196.0, "reward": 0.9328125238418579, "reward_std": 0.07412366569042206, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9328125238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.15820899307727815, "sampling/importance_sampling_ratio/max": 1.9864572286605835, "sampling/importance_sampling_ratio/mean": 0.9999479293823242, "sampling/importance_sampling_ratio/min": 0.278659051656725, "sampling/sampling_logp_difference/max": 7.299841666221619, "sampling/sampling_logp_difference/mean": 0.013311752490699292, "step": 3900 }, { "epoch": 4.582843713278496, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1600.16, "eval_completions/max_terminated_length": 1600.16, "eval_completions/mean_length": 1158.768125, "eval_completions/mean_terminated_length": 1158.768125, "eval_completions/min_length": 863.52, "eval_completions/min_terminated_length": 863.52, "eval_entropy": 0.2665262085199356, "eval_frac_reward_zero_std": 0.6, "eval_loss": 0.0024516424164175987, "eval_num_tokens": 534929196.0, "eval_reward": 0.7693125116825104, "eval_reward_std": 0.07587137021124363, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7693125116825104, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.29620310723781584, "eval_runtime": 435.3245, "eval_samples_per_second": 0.23, "eval_sampling/importance_sampling_ratio/max": 1.935142192840576, "eval_sampling/importance_sampling_ratio/mean": 0.9999475979804993, "eval_sampling/importance_sampling_ratio/min": 0.30313371320943816, "eval_sampling/sampling_logp_difference/max": 2.0260102558135986, "eval_sampling/sampling_logp_difference/mean": 0.013469803594052792, "eval_steps_per_second": 0.005, "step": 3900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1640.6, "completions/max_terminated_length": 1640.6, "completions/mean_length": 1170.825, "completions/mean_terminated_length": 1170.825, "completions/min_length": 838.4, "completions/min_terminated_length": 838.4, "entropy": 0.24506229162216187, "epoch": 4.5887191539365455, "frac_reward_zero_std": 0.65, "grad_norm": 0.7513599991798401, "learning_rate": 4.252483644293676e-08, "loss": -0.0017, "num_tokens": 535622804.0, "reward": 0.9093750238418579, "reward_std": 0.07427767887711526, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9093750238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.15920108705759048, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001034498214723, "sampling/importance_sampling_ratio/min": 0.3779327243566513, "sampling/sampling_logp_difference/max": 1.4409908533096314, "sampling/sampling_logp_difference/mean": 0.012623549997806549, "step": 3905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1821.8, "completions/max_terminated_length": 1821.8, "completions/mean_length": 1271.396875, "completions/mean_terminated_length": 1271.396875, "completions/min_length": 911.2, "completions/min_terminated_length": 911.2, "entropy": 0.2800646245479584, "epoch": 4.594594594594595, "frac_reward_zero_std": 0.5, "grad_norm": 0.6105312705039978, "learning_rate": 4.191906954204022e-08, "loss": 0.0054, "num_tokens": 536353683.0, "reward": 0.8318229198455811, "reward_std": 0.08738780058920384, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8318229198455811, "rewards/e2e_recall_precision_mixed_reward/std": 0.18347244337201118, "sampling/importance_sampling_ratio/max": 1.9310923099517823, "sampling/importance_sampling_ratio/mean": 1.0000660181045533, "sampling/importance_sampling_ratio/min": 0.3126604288816452, "sampling/sampling_logp_difference/max": 1.3532617330551147, "sampling/sampling_logp_difference/mean": 0.013997785560786725, "step": 3910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1372.8, "completions/max_terminated_length": 1372.8, "completions/mean_length": 1049.075, "completions/mean_terminated_length": 1049.075, "completions/min_length": 797.6, "completions/min_terminated_length": 797.6, "entropy": 0.24361443221569062, "epoch": 4.600470035252644, "frac_reward_zero_std": 0.65, "grad_norm": 0.0, "learning_rate": 4.131330264114369e-08, "loss": -0.0028, "num_tokens": 536989403.0, "reward": 0.8885416746139526, "reward_std": 0.04319274723529816, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8885416746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.11913625001907349, "sampling/importance_sampling_ratio/max": 1.9888309478759765, "sampling/importance_sampling_ratio/mean": 1.0000429511070252, "sampling/importance_sampling_ratio/min": 0.3370845317840576, "sampling/sampling_logp_difference/max": 1.1793973922729493, "sampling/sampling_logp_difference/mean": 0.01257583238184452, "step": 3915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1515.2, "completions/max_terminated_length": 1515.2, "completions/mean_length": 1166.315625, "completions/mean_terminated_length": 1166.315625, "completions/min_length": 898.0, "completions/min_terminated_length": 898.0, "entropy": 0.2607916831970215, "epoch": 4.606345475910693, "frac_reward_zero_std": 0.8, "grad_norm": 0.4696587920188904, "learning_rate": 4.070753574024715e-08, "loss": -0.0037, "num_tokens": 537690480.0, "reward": 0.7993229150772094, "reward_std": 0.030344261974096298, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7993229269981384, "rewards/e2e_recall_precision_mixed_reward/std": 0.23416321575641633, "sampling/importance_sampling_ratio/max": 1.9028977155685425, "sampling/importance_sampling_ratio/mean": 1.0000278234481812, "sampling/importance_sampling_ratio/min": 0.294919428229332, "sampling/sampling_logp_difference/max": 1.323739767074585, "sampling/sampling_logp_difference/mean": 0.013191297091543675, "step": 3920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1660.6, "completions/max_terminated_length": 1660.6, "completions/mean_length": 1238.453125, "completions/mean_terminated_length": 1238.453125, "completions/min_length": 955.6, "completions/min_terminated_length": 955.6, "entropy": 0.27204486131668093, "epoch": 4.6122209165687424, "frac_reward_zero_std": 0.65, "grad_norm": 0.0, "learning_rate": 4.010176883935062e-08, "loss": 0.0012, "num_tokens": 538432081.0, "reward": 0.8140625119209289, "reward_std": 0.058893933147192004, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8140625119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.27546189576387403, "sampling/importance_sampling_ratio/max": 1.9770929336547851, "sampling/importance_sampling_ratio/mean": 1.0000423312187194, "sampling/importance_sampling_ratio/min": 0.27728479243814946, "sampling/sampling_logp_difference/max": 1.5920629739761352, "sampling/sampling_logp_difference/mean": 0.013747538067400455, "step": 3925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1446.6, "completions/max_terminated_length": 1446.6, "completions/mean_length": 1153.44375, "completions/mean_terminated_length": 1153.44375, "completions/min_length": 893.2, "completions/min_terminated_length": 893.2, "entropy": 0.25421291291713716, "epoch": 4.618096357226792, "frac_reward_zero_std": 0.7, "grad_norm": 0.0, "learning_rate": 3.949600193845408e-08, "loss": -0.0001, "num_tokens": 539115407.0, "reward": 0.9856771111488343, "reward_std": 0.037527060508728026, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9856771111488343, "rewards/e2e_recall_precision_mixed_reward/std": 0.06466835737228394, "sampling/importance_sampling_ratio/max": 1.9693354606628417, "sampling/importance_sampling_ratio/mean": 0.999970543384552, "sampling/importance_sampling_ratio/min": 0.3985715299844742, "sampling/sampling_logp_difference/max": 1.1402654647827148, "sampling/sampling_logp_difference/mean": 0.013061221688985825, "step": 3930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1721.0, "completions/max_terminated_length": 1721.0, "completions/mean_length": 1277.778125, "completions/mean_terminated_length": 1277.778125, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "entropy": 0.2775550276041031, "epoch": 4.623971797884842, "frac_reward_zero_std": 0.7, "grad_norm": 0.3847541809082031, "learning_rate": 3.8890235037557545e-08, "loss": 0.0022, "num_tokens": 539880904.0, "reward": 0.8575520992279053, "reward_std": 0.05469306409358978, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8575520992279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.1839374229311943, "sampling/importance_sampling_ratio/max": 1.9318484306335448, "sampling/importance_sampling_ratio/mean": 0.9999679446220398, "sampling/importance_sampling_ratio/min": 0.2941402941942215, "sampling/sampling_logp_difference/max": 1.2686848640441895, "sampling/sampling_logp_difference/mean": 0.013829667121171951, "step": 3935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1726.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 1206.425, "completions/mean_terminated_length": 1206.425, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "entropy": 0.27049291133880615, "epoch": 4.629847238542891, "frac_reward_zero_std": 0.65, "grad_norm": 0.6516361832618713, "learning_rate": 3.828446813666101e-08, "loss": -0.0076, "num_tokens": 540607264.0, "reward": 0.7458333432674408, "reward_std": 0.07592152431607246, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7458333432674408, "rewards/e2e_recall_precision_mixed_reward/std": 0.2239207148551941, "sampling/importance_sampling_ratio/max": 1.9414830446243285, "sampling/importance_sampling_ratio/mean": 0.9999024152755738, "sampling/importance_sampling_ratio/min": 0.4399965822696686, "sampling/sampling_logp_difference/max": 1.0633353471755982, "sampling/sampling_logp_difference/mean": 0.013744635693728923, "step": 3940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 1705.6, "completions/max_terminated_length": 1683.6, "completions/mean_length": 1243.896875, "completions/mean_terminated_length": 1217.3513427734374, "completions/min_length": 937.6, "completions/min_terminated_length": 937.6, "entropy": 0.27136591672897337, "epoch": 4.63572267920094, "frac_reward_zero_std": 0.6, "grad_norm": 0.45293742418289185, "learning_rate": 3.767870123576448e-08, "loss": -0.0076, "num_tokens": 541313663.0, "reward": 0.737500011920929, "reward_std": 0.07753491625189782, "rewards/e2e_recall_precision_mixed_reward/mean": 0.737500011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.32985628843307496, "sampling/importance_sampling_ratio/max": 1.9891737461090089, "sampling/importance_sampling_ratio/mean": 1.000024402141571, "sampling/importance_sampling_ratio/min": 0.3064669918268919, "sampling/sampling_logp_difference/max": 1.5447747945785522, "sampling/sampling_logp_difference/mean": 0.013566815294325352, "step": 3945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1630.4, "completions/max_terminated_length": 1630.4, "completions/mean_length": 1191.609375, "completions/mean_terminated_length": 1191.609375, "completions/min_length": 866.6, "completions/min_terminated_length": 866.6, "entropy": 0.24883055090904235, "epoch": 4.6415981198589895, "frac_reward_zero_std": 0.55, "grad_norm": 0.6785134077072144, "learning_rate": 3.707293433486794e-08, "loss": 0.0025, "num_tokens": 542031426.0, "reward": 0.8836979389190673, "reward_std": 0.07515020072460174, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8836979389190673, "rewards/e2e_recall_precision_mixed_reward/std": 0.21068175360560418, "sampling/importance_sampling_ratio/max": 1.997580623626709, "sampling/importance_sampling_ratio/mean": 1.00013267993927, "sampling/importance_sampling_ratio/min": 0.40443000197410583, "sampling/sampling_logp_difference/max": 0.9994537353515625, "sampling/sampling_logp_difference/mean": 0.012894240953028203, "step": 3950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1558.4, "completions/max_terminated_length": 1558.4, "completions/mean_length": 1170.86875, "completions/mean_terminated_length": 1170.86875, "completions/min_length": 851.2, "completions/min_terminated_length": 851.2, "entropy": 0.2621671468019485, "epoch": 4.647473560517039, "frac_reward_zero_std": 0.65, "grad_norm": 0.6207395195960999, "learning_rate": 3.646716743397141e-08, "loss": 0.0014, "num_tokens": 542759560.0, "reward": 0.8729166865348816, "reward_std": 0.059103918820619585, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8729166865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.19129492193460465, "sampling/importance_sampling_ratio/max": 1.8469502925872803, "sampling/importance_sampling_ratio/mean": 1.000000774860382, "sampling/importance_sampling_ratio/min": 0.3592810183763504, "sampling/sampling_logp_difference/max": 1.0624561548233031, "sampling/sampling_logp_difference/mean": 0.01351084392517805, "step": 3955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 1172.725, "completions/mean_terminated_length": 1172.725, "completions/min_length": 919.4, "completions/min_terminated_length": 919.4, "entropy": 0.2633721888065338, "epoch": 4.653349001175088, "frac_reward_zero_std": 0.65, "grad_norm": 0.7144450545310974, "learning_rate": 3.5861400533074866e-08, "loss": -0.0002, "num_tokens": 543453360.0, "reward": 0.8821875095367432, "reward_std": 0.06506949663162231, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8821875095367432, "rewards/e2e_recall_precision_mixed_reward/std": 0.18558357805013656, "sampling/importance_sampling_ratio/max": 1.9161983489990235, "sampling/importance_sampling_ratio/mean": 0.999916136264801, "sampling/importance_sampling_ratio/min": 0.28187123835086825, "sampling/sampling_logp_difference/max": 1.3893019914627076, "sampling/sampling_logp_difference/mean": 0.013280938379466534, "step": 3960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1580.8, "completions/max_terminated_length": 1580.8, "completions/mean_length": 1179.315625, "completions/mean_terminated_length": 1179.315625, "completions/min_length": 882.6, "completions/min_terminated_length": 882.6, "entropy": 0.2579510986804962, "epoch": 4.659224441833137, "frac_reward_zero_std": 0.65, "grad_norm": 0.6650709509849548, "learning_rate": 3.5255633632178335e-08, "loss": 0.0012, "num_tokens": 544191765.0, "reward": 0.848437511920929, "reward_std": 0.05586938187479973, "rewards/e2e_recall_precision_mixed_reward/mean": 0.848437511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.21678120493888856, "sampling/importance_sampling_ratio/max": 1.9823765516281129, "sampling/importance_sampling_ratio/mean": 0.9999704122543335, "sampling/importance_sampling_ratio/min": 0.3650485098361969, "sampling/sampling_logp_difference/max": 1.1645691871643067, "sampling/sampling_logp_difference/mean": 0.013468297570943833, "step": 3965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1900.4, "completions/max_terminated_length": 1900.4, "completions/mean_length": 1203.55, "completions/mean_terminated_length": 1203.55, "completions/min_length": 880.4, "completions/min_terminated_length": 880.4, "entropy": 0.24793876111507415, "epoch": 4.6650998824911865, "frac_reward_zero_std": 0.55, "grad_norm": 0.5894632339477539, "learning_rate": 3.46498667312818e-08, "loss": 0.0004, "num_tokens": 544872565.0, "reward": 0.7872395992279053, "reward_std": 0.09235174730420112, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7872396111488342, "rewards/e2e_recall_precision_mixed_reward/std": 0.3238639831542969, "sampling/importance_sampling_ratio/max": 1.927569079399109, "sampling/importance_sampling_ratio/mean": 0.9999173641204834, "sampling/importance_sampling_ratio/min": 0.3570482492446899, "sampling/sampling_logp_difference/max": 1.0754551410675048, "sampling/sampling_logp_difference/mean": 0.012769071571528911, "step": 3970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1466.2, "completions/max_terminated_length": 1466.2, "completions/mean_length": 1168.040625, "completions/mean_terminated_length": 1168.040625, "completions/min_length": 926.0, "completions/min_terminated_length": 926.0, "entropy": 0.2611490249633789, "epoch": 4.670975323149237, "frac_reward_zero_std": 0.7, "grad_norm": 0.0, "learning_rate": 3.404409983038527e-08, "loss": 0.0065, "num_tokens": 545571234.0, "reward": 0.8796875238418579, "reward_std": 0.06576394066214561, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8796875238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.2026699587702751, "sampling/importance_sampling_ratio/max": 1.9442184925079347, "sampling/importance_sampling_ratio/mean": 0.9999972224235535, "sampling/importance_sampling_ratio/min": 0.37336876094341276, "sampling/sampling_logp_difference/max": 1.0985134363174438, "sampling/sampling_logp_difference/mean": 0.013413655199110508, "step": 3975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1692.6, "completions/max_terminated_length": 1692.6, "completions/mean_length": 1231.178125, "completions/mean_terminated_length": 1231.178125, "completions/min_length": 876.6, "completions/min_terminated_length": 876.6, "entropy": 0.2676435440778732, "epoch": 4.676850763807286, "frac_reward_zero_std": 0.6, "grad_norm": 0.8372879028320312, "learning_rate": 3.343833292948873e-08, "loss": 0.0038, "num_tokens": 546334107.0, "reward": 0.8958333492279053, "reward_std": 0.06377268582582474, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8958333492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.21343012303113937, "sampling/importance_sampling_ratio/max": 1.99753577709198, "sampling/importance_sampling_ratio/mean": 0.999961519241333, "sampling/importance_sampling_ratio/min": 0.31758705228567125, "sampling/sampling_logp_difference/max": 1.2507687091827393, "sampling/sampling_logp_difference/mean": 0.013471122644841672, "step": 3980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1752.2, "completions/max_terminated_length": 1752.2, "completions/mean_length": 1251.534375, "completions/mean_terminated_length": 1251.534375, "completions/min_length": 935.4, "completions/min_terminated_length": 935.4, "entropy": 0.2761573553085327, "epoch": 4.682726204465335, "frac_reward_zero_std": 0.8, "grad_norm": 0.5521088242530823, "learning_rate": 3.28325660285922e-08, "loss": 0.0005, "num_tokens": 547056854.0, "reward": 0.9140625, "reward_std": 0.042132827639579776, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9140625, "rewards/e2e_recall_precision_mixed_reward/std": 0.10463754087686539, "sampling/importance_sampling_ratio/max": 1.9920671224594115, "sampling/importance_sampling_ratio/mean": 0.999975323677063, "sampling/importance_sampling_ratio/min": 0.3770316272974014, "sampling/sampling_logp_difference/max": 1.247798991203308, "sampling/sampling_logp_difference/mean": 0.013781622983515263, "step": 3985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1766.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 1225.1625, "completions/mean_terminated_length": 1225.1625, "completions/min_length": 924.2, "completions/min_terminated_length": 924.2, "entropy": 0.27316820025444033, "epoch": 4.688601645123384, "frac_reward_zero_std": 0.65, "grad_norm": 0.5807452201843262, "learning_rate": 3.2226799127695656e-08, "loss": 0.0042, "num_tokens": 547772058.0, "reward": 0.7897916674613953, "reward_std": 0.05927639305591583, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7897916674613953, "rewards/e2e_recall_precision_mixed_reward/std": 0.26461312770843504, "sampling/importance_sampling_ratio/max": 1.8936434745788575, "sampling/importance_sampling_ratio/mean": 0.9999576568603515, "sampling/importance_sampling_ratio/min": 0.3703816294670105, "sampling/sampling_logp_difference/max": 1.2750481605529784, "sampling/sampling_logp_difference/mean": 0.013870716467499733, "step": 3990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1602.6, "completions/max_terminated_length": 1602.6, "completions/mean_length": 1158.24375, "completions/mean_terminated_length": 1158.24375, "completions/min_length": 877.2, "completions/min_terminated_length": 877.2, "entropy": 0.255287766456604, "epoch": 4.6944770857814335, "frac_reward_zero_std": 0.5, "grad_norm": 0.9467837810516357, "learning_rate": 3.1621032226799126e-08, "loss": 0.0016, "num_tokens": 548446504.0, "reward": 0.909375011920929, "reward_std": 0.0933942548930645, "rewards/e2e_recall_precision_mixed_reward/mean": 0.909375011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.1927630953490734, "sampling/importance_sampling_ratio/max": 1.9999804496765137, "sampling/importance_sampling_ratio/mean": 1.0001122236251831, "sampling/importance_sampling_ratio/min": 0.35746287405490873, "sampling/sampling_logp_difference/max": 1.3460246801376343, "sampling/sampling_logp_difference/mean": 0.013273773156106472, "step": 3995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1541.6, "completions/max_terminated_length": 1541.6, "completions/mean_length": 1136.003125, "completions/mean_terminated_length": 1136.003125, "completions/min_length": 832.8, "completions/min_terminated_length": 832.8, "entropy": 0.25159532129764556, "epoch": 4.700352526439483, "frac_reward_zero_std": 0.45, "grad_norm": 0.8297567367553711, "learning_rate": 3.101526532590259e-08, "loss": 0.0062, "num_tokens": 549096761.0, "reward": 0.8660937666893005, "reward_std": 0.10318772792816162, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8660937666893005, "rewards/e2e_recall_precision_mixed_reward/std": 0.22221954464912413, "sampling/importance_sampling_ratio/max": 1.9245586156845094, "sampling/importance_sampling_ratio/mean": 1.0000725388526917, "sampling/importance_sampling_ratio/min": 0.32359273731708527, "sampling/sampling_logp_difference/max": 1.1840140581130982, "sampling/sampling_logp_difference/mean": 0.012639879994094372, "step": 4000 }, { "epoch": 4.700352526439483, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1620.08, "eval_completions/max_terminated_length": 1620.08, "eval_completions/mean_length": 1171.0, "eval_completions/mean_terminated_length": 1171.0, "eval_completions/min_length": 876.84, "eval_completions/min_terminated_length": 876.84, "eval_entropy": 0.2678144866228104, "eval_frac_reward_zero_std": 0.61, "eval_loss": 0.002559106098487973, "eval_num_tokens": 549096761.0, "eval_reward": 0.7812708473205566, "eval_reward_std": 0.07859123006463051, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7812708473205566, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2977286207675934, "eval_runtime": 445.3753, "eval_samples_per_second": 0.225, "eval_sampling/importance_sampling_ratio/max": 1.9588973712921143, "eval_sampling/importance_sampling_ratio/mean": 1.000038492679596, "eval_sampling/importance_sampling_ratio/min": 0.37385270178318025, "eval_sampling/sampling_logp_difference/max": 1.1236275243759155, "eval_sampling/sampling_logp_difference/mean": 0.013486710004508495, "eval_steps_per_second": 0.004, "step": 4000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1669.4, "completions/max_terminated_length": 1669.4, "completions/mean_length": 1135.534375, "completions/mean_terminated_length": 1135.534375, "completions/min_length": 805.4, "completions/min_terminated_length": 805.4, "entropy": 0.2553311824798584, "epoch": 4.706227967097532, "frac_reward_zero_std": 0.6, "grad_norm": 0.40861353278160095, "learning_rate": 3.040949842500606e-08, "loss": 0.0015, "num_tokens": 549800276.0, "reward": 0.806250023841858, "reward_std": 0.07501365020871162, "rewards/e2e_recall_precision_mixed_reward/mean": 0.806250023841858, "rewards/e2e_recall_precision_mixed_reward/std": 0.24988763481378556, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000290989875793, "sampling/importance_sampling_ratio/min": 0.300783509016037, "sampling/sampling_logp_difference/max": 1.3301929354667663, "sampling/sampling_logp_difference/mean": 0.013078136369585991, "step": 4005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1840.6, "completions/max_terminated_length": 1725.8, "completions/mean_length": 1265.096875, "completions/mean_terminated_length": 1261.593212890625, "completions/min_length": 888.2, "completions/min_terminated_length": 888.2, "entropy": 0.2794096082448959, "epoch": 4.712103407755581, "frac_reward_zero_std": 0.6, "grad_norm": 0.541865885257721, "learning_rate": 2.980373152410952e-08, "loss": -0.0058, "num_tokens": 550526655.0, "reward": 0.9447916746139526, "reward_std": 0.07777333706617355, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9447916746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.13151190429925919, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000796794891358, "sampling/importance_sampling_ratio/min": 0.3717313528060913, "sampling/sampling_logp_difference/max": 1.1467663288116454, "sampling/sampling_logp_difference/mean": 0.013969559781253338, "step": 4010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1709.2, "completions/max_terminated_length": 1709.2, "completions/mean_length": 1249.7875, "completions/mean_terminated_length": 1249.7875, "completions/min_length": 963.0, "completions/min_terminated_length": 963.0, "entropy": 0.28808929324150084, "epoch": 4.717978848413631, "frac_reward_zero_std": 0.55, "grad_norm": 0.47011598944664, "learning_rate": 2.9197964623212987e-08, "loss": -0.0053, "num_tokens": 551228443.0, "reward": 0.8390625, "reward_std": 0.05702723562717438, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8390625, "rewards/e2e_recall_precision_mixed_reward/std": 0.24377740174531937, "sampling/importance_sampling_ratio/max": 1.9100152969360351, "sampling/importance_sampling_ratio/mean": 0.9998533844947814, "sampling/importance_sampling_ratio/min": 0.31113848388195037, "sampling/sampling_logp_difference/max": 1.2729925632476806, "sampling/sampling_logp_difference/mean": 0.014077169820666313, "step": 4015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1533.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 1139.98125, "completions/mean_terminated_length": 1139.98125, "completions/min_length": 857.2, "completions/min_terminated_length": 857.2, "entropy": 0.2606339991092682, "epoch": 4.723854289071681, "frac_reward_zero_std": 0.7, "grad_norm": 0.6212969422340393, "learning_rate": 2.859219772231645e-08, "loss": 0.003, "num_tokens": 551908709.0, "reward": 0.86171875, "reward_std": 0.05891707688570023, "rewards/e2e_recall_precision_mixed_reward/mean": 0.86171875, "rewards/e2e_recall_precision_mixed_reward/std": 0.25346395522356036, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000167965888977, "sampling/importance_sampling_ratio/min": 0.2859252363443375, "sampling/sampling_logp_difference/max": 1.3241569757461549, "sampling/sampling_logp_difference/mean": 0.013379177823662759, "step": 4020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1564.2, "completions/max_terminated_length": 1564.2, "completions/mean_length": 1149.69375, "completions/mean_terminated_length": 1149.69375, "completions/min_length": 864.8, "completions/min_terminated_length": 864.8, "entropy": 0.2674075663089752, "epoch": 4.72972972972973, "frac_reward_zero_std": 0.65, "grad_norm": 0.6235418915748596, "learning_rate": 2.7986430821419916e-08, "loss": -0.0026, "num_tokens": 552624643.0, "reward": 0.7882812559604645, "reward_std": 0.05237821713089943, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7882812559604645, "rewards/e2e_recall_precision_mixed_reward/std": 0.2877985715866089, "sampling/importance_sampling_ratio/max": 1.906586217880249, "sampling/importance_sampling_ratio/mean": 0.9999934434890747, "sampling/importance_sampling_ratio/min": 0.45421077609062194, "sampling/sampling_logp_difference/max": 0.8108912110328674, "sampling/sampling_logp_difference/mean": 0.013526509329676629, "step": 4025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1647.6, "completions/max_terminated_length": 1647.6, "completions/mean_length": 1204.103125, "completions/mean_terminated_length": 1204.103125, "completions/min_length": 949.2, "completions/min_terminated_length": 949.2, "entropy": 0.27815998792648317, "epoch": 4.735605170387779, "frac_reward_zero_std": 0.8, "grad_norm": 0.7050502896308899, "learning_rate": 2.7380663920523383e-08, "loss": 0.0019, "num_tokens": 553323572.0, "reward": 0.8698958396911621, "reward_std": 0.03121014088392258, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8698958396911621, "rewards/e2e_recall_precision_mixed_reward/std": 0.16822029352188111, "sampling/importance_sampling_ratio/max": 1.90050311088562, "sampling/importance_sampling_ratio/mean": 1.000030755996704, "sampling/importance_sampling_ratio/min": 0.3887382984161377, "sampling/sampling_logp_difference/max": 0.9473320245742798, "sampling/sampling_logp_difference/mean": 0.013588633574545383, "step": 4030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1697.4, "completions/max_terminated_length": 1697.4, "completions/mean_length": 1214.74375, "completions/mean_terminated_length": 1214.74375, "completions/min_length": 905.8, "completions/min_terminated_length": 905.8, "entropy": 0.2639480948448181, "epoch": 4.741480611045828, "frac_reward_zero_std": 0.6, "grad_norm": 0.4301367998123169, "learning_rate": 2.6774897019626845e-08, "loss": -0.0019, "num_tokens": 554043042.0, "reward": 0.8942708492279052, "reward_std": 0.08395053669810296, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8942708492279052, "rewards/e2e_recall_precision_mixed_reward/std": 0.21787521839141846, "sampling/importance_sampling_ratio/max": 1.9720707178115844, "sampling/importance_sampling_ratio/mean": 1.0000418543815612, "sampling/importance_sampling_ratio/min": 0.3419287145137787, "sampling/sampling_logp_difference/max": 1.1653326988220214, "sampling/sampling_logp_difference/mean": 0.013334690406918526, "step": 4035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1683.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 1188.94375, "completions/mean_terminated_length": 1188.94375, "completions/min_length": 851.4, "completions/min_terminated_length": 851.4, "entropy": 0.26343382298946383, "epoch": 4.7473560517038775, "frac_reward_zero_std": 0.45, "grad_norm": 0.6100848317146301, "learning_rate": 2.616913011873031e-08, "loss": 0.0105, "num_tokens": 554709632.0, "reward": 0.8104166865348816, "reward_std": 0.09645045325160026, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8104166865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.24818702638149262, "sampling/importance_sampling_ratio/max": 1.929994034767151, "sampling/importance_sampling_ratio/mean": 0.9999106764793396, "sampling/importance_sampling_ratio/min": 0.4044054388999939, "sampling/sampling_logp_difference/max": 0.9244142293930053, "sampling/sampling_logp_difference/mean": 0.013344330713152885, "step": 4040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1816.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 1253.66875, "completions/mean_terminated_length": 1253.66875, "completions/min_length": 878.6, "completions/min_terminated_length": 878.6, "entropy": 0.2727950155735016, "epoch": 4.753231492361927, "frac_reward_zero_std": 0.6, "grad_norm": 0.5697612762451172, "learning_rate": 2.5563363217833778e-08, "loss": -0.0015, "num_tokens": 555431734.0, "reward": 0.7989583492279053, "reward_std": 0.08472683280706406, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7989583492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.3380668729543686, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000714182853698, "sampling/importance_sampling_ratio/min": 0.2469423845410347, "sampling/sampling_logp_difference/max": 1.522918152809143, "sampling/sampling_logp_difference/mean": 0.013825003430247307, "step": 4045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1673.0, "completions/max_terminated_length": 1673.0, "completions/mean_length": 1177.33125, "completions/mean_terminated_length": 1177.33125, "completions/min_length": 785.8, "completions/min_terminated_length": 785.8, "entropy": 0.27231194376945494, "epoch": 4.759106933019977, "frac_reward_zero_std": 0.6, "grad_norm": 0.38864025473594666, "learning_rate": 2.495759631693724e-08, "loss": 0.0024, "num_tokens": 556151040.0, "reward": 0.8744791984558106, "reward_std": 0.0681779682636261, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8744791984558106, "rewards/e2e_recall_precision_mixed_reward/std": 0.20191093385219575, "sampling/importance_sampling_ratio/max": 1.9608511447906494, "sampling/importance_sampling_ratio/mean": 1.0000580906867982, "sampling/importance_sampling_ratio/min": 0.33743279576301577, "sampling/sampling_logp_difference/max": 1.128471803665161, "sampling/sampling_logp_difference/mean": 0.013923012092709542, "step": 4050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1602.0, "completions/max_terminated_length": 1602.0, "completions/mean_length": 1198.00625, "completions/mean_terminated_length": 1198.00625, "completions/min_length": 891.2, "completions/min_terminated_length": 891.2, "entropy": 0.27365310192108155, "epoch": 4.764982373678026, "frac_reward_zero_std": 0.55, "grad_norm": 0.35927513241767883, "learning_rate": 2.4351829416040707e-08, "loss": 0.0013, "num_tokens": 556885026.0, "reward": 0.8122395992279052, "reward_std": 0.10333790630102158, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8122395992279052, "rewards/e2e_recall_precision_mixed_reward/std": 0.2743197739124298, "sampling/importance_sampling_ratio/max": 1.9623225688934327, "sampling/importance_sampling_ratio/mean": 1.000006079673767, "sampling/importance_sampling_ratio/min": 0.36535613536834716, "sampling/sampling_logp_difference/max": 1.1020445585250855, "sampling/sampling_logp_difference/mean": 0.013802669942378998, "step": 4055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1808.0, "completions/max_terminated_length": 1699.6, "completions/mean_length": 1243.16875, "completions/mean_terminated_length": 1239.767333984375, "completions/min_length": 923.4, "completions/min_terminated_length": 923.4, "entropy": 0.2812578797340393, "epoch": 4.770857814336075, "frac_reward_zero_std": 0.6, "grad_norm": 0.6282638311386108, "learning_rate": 2.374606251514417e-08, "loss": -0.0145, "num_tokens": 557614548.0, "reward": 0.8213541626930236, "reward_std": 0.06305044703185558, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8213541746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.2784075617790222, "sampling/importance_sampling_ratio/max": 1.9655727863311767, "sampling/importance_sampling_ratio/mean": 1.0000412583351135, "sampling/importance_sampling_ratio/min": 0.3901285082101822, "sampling/sampling_logp_difference/max": 1.2020664453506469, "sampling/sampling_logp_difference/mean": 0.01414027102291584, "step": 4060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1658.0, "completions/max_terminated_length": 1658.0, "completions/mean_length": 1201.15, "completions/mean_terminated_length": 1201.15, "completions/min_length": 886.6, "completions/min_terminated_length": 886.6, "entropy": 0.25774570405483244, "epoch": 4.776733254994125, "frac_reward_zero_std": 0.7, "grad_norm": 0.6430007815361023, "learning_rate": 2.3140295614247636e-08, "loss": 0.0008, "num_tokens": 558341524.0, "reward": 0.979687511920929, "reward_std": 0.04776434972882271, "rewards/e2e_recall_precision_mixed_reward/mean": 0.979687511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.07553751319646836, "sampling/importance_sampling_ratio/max": 1.9550664186477662, "sampling/importance_sampling_ratio/mean": 1.0000919699668884, "sampling/importance_sampling_ratio/min": 0.3034312278032303, "sampling/sampling_logp_difference/max": 1.3194203138351441, "sampling/sampling_logp_difference/mean": 0.013247447088360786, "step": 4065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1630.2, "completions/max_terminated_length": 1630.2, "completions/mean_length": 1149.659375, "completions/mean_terminated_length": 1149.659375, "completions/min_length": 857.8, "completions/min_terminated_length": 857.8, "entropy": 0.26009349822998046, "epoch": 4.782608695652174, "frac_reward_zero_std": 0.6, "grad_norm": 0.5895561575889587, "learning_rate": 2.2534528713351102e-08, "loss": 0.0013, "num_tokens": 559041063.0, "reward": 0.8940104246139526, "reward_std": 0.08193137794733048, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8940104246139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.18850080221891402, "sampling/importance_sampling_ratio/max": 1.9591300249099732, "sampling/importance_sampling_ratio/mean": 0.9999151587486267, "sampling/importance_sampling_ratio/min": 0.2794345647096634, "sampling/sampling_logp_difference/max": 1.3429698705673219, "sampling/sampling_logp_difference/mean": 0.013466096855700016, "step": 4070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1634.0, "completions/max_terminated_length": 1634.0, "completions/mean_length": 1177.80625, "completions/mean_terminated_length": 1177.80625, "completions/min_length": 950.8, "completions/min_terminated_length": 950.8, "entropy": 0.2671573221683502, "epoch": 4.788484136310223, "frac_reward_zero_std": 0.75, "grad_norm": 0.43248382210731506, "learning_rate": 2.1928761812454565e-08, "loss": -0.0063, "num_tokens": 559741417.0, "reward": 0.8859375238418579, "reward_std": 0.05752565562725067, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8859375238418579, "rewards/e2e_recall_precision_mixed_reward/std": 0.2233549490571022, "sampling/importance_sampling_ratio/max": 1.9804691076278687, "sampling/importance_sampling_ratio/mean": 1.0001835584640504, "sampling/importance_sampling_ratio/min": 0.43283228278160096, "sampling/sampling_logp_difference/max": 0.8982840418815613, "sampling/sampling_logp_difference/mean": 0.013595516420900821, "step": 4075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1527.2, "completions/max_terminated_length": 1527.2, "completions/mean_length": 1168.15, "completions/mean_terminated_length": 1168.15, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "entropy": 0.2654030591249466, "epoch": 4.794359576968272, "frac_reward_zero_std": 0.6, "grad_norm": 0.6133489012718201, "learning_rate": 2.132299491155803e-08, "loss": 0.0051, "num_tokens": 560460969.0, "reward": 0.8815104365348816, "reward_std": 0.08448518216609954, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8815104365348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.16949040293693543, "sampling/importance_sampling_ratio/max": 1.931166696548462, "sampling/importance_sampling_ratio/mean": 1.0000255107879639, "sampling/importance_sampling_ratio/min": 0.3363965079188347, "sampling/sampling_logp_difference/max": 1.3793178796768188, "sampling/sampling_logp_difference/mean": 0.013709034956991672, "step": 4080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1654.8, "completions/max_terminated_length": 1654.8, "completions/mean_length": 1204.725, "completions/mean_terminated_length": 1204.725, "completions/min_length": 909.8, "completions/min_terminated_length": 909.8, "entropy": 0.2663749247789383, "epoch": 4.800235017626322, "frac_reward_zero_std": 0.7, "grad_norm": 0.6036227941513062, "learning_rate": 2.0717228010661497e-08, "loss": 0.004, "num_tokens": 561160193.0, "reward": 0.9296875, "reward_std": 0.06299812644720078, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9296875, "rewards/e2e_recall_precision_mixed_reward/std": 0.11461469382047654, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000427842140198, "sampling/importance_sampling_ratio/min": 0.29730011597275735, "sampling/sampling_logp_difference/max": 1.5401460886001588, "sampling/sampling_logp_difference/mean": 0.01355595700442791, "step": 4085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1699.2, "completions/max_terminated_length": 1699.2, "completions/mean_length": 1228.4375, "completions/mean_terminated_length": 1228.4375, "completions/min_length": 936.4, "completions/min_terminated_length": 936.4, "entropy": 0.26536994278430937, "epoch": 4.806110458284372, "frac_reward_zero_std": 0.65, "grad_norm": 0.4142984449863434, "learning_rate": 2.011146110976496e-08, "loss": 0.0049, "num_tokens": 561858797.0, "reward": 0.8854166865348816, "reward_std": 0.07202789336442947, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8854166865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.184259794652462, "sampling/importance_sampling_ratio/max": 1.9609678506851196, "sampling/importance_sampling_ratio/mean": 1.0000285267829896, "sampling/importance_sampling_ratio/min": 0.2662481516599655, "sampling/sampling_logp_difference/max": 1.6673511743545533, "sampling/sampling_logp_difference/mean": 0.01343822181224823, "step": 4090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1498.8, "completions/max_terminated_length": 1498.8, "completions/mean_length": 1137.05625, "completions/mean_terminated_length": 1137.05625, "completions/min_length": 809.4, "completions/min_terminated_length": 809.4, "entropy": 0.265997040271759, "epoch": 4.811985898942421, "frac_reward_zero_std": 0.5, "grad_norm": 0.6153071522712708, "learning_rate": 1.9505694208868426e-08, "loss": -0.0008, "num_tokens": 562600847.0, "reward": 0.8677083492279053, "reward_std": 0.06810193136334419, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8677083492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.23375769779086114, "sampling/importance_sampling_ratio/max": 1.9153061866760255, "sampling/importance_sampling_ratio/mean": 0.9999323844909668, "sampling/importance_sampling_ratio/min": 0.4602263569831848, "sampling/sampling_logp_difference/max": 0.876070213317871, "sampling/sampling_logp_difference/mean": 0.013519445993006229, "step": 4095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1677.6, "completions/max_terminated_length": 1677.6, "completions/mean_length": 1153.478125, "completions/mean_terminated_length": 1153.478125, "completions/min_length": 829.2, "completions/min_terminated_length": 829.2, "entropy": 0.2523133546113968, "epoch": 4.81786133960047, "frac_reward_zero_std": 0.55, "grad_norm": 0.8459767699241638, "learning_rate": 1.889992730797189e-08, "loss": -0.0005, "num_tokens": 563335752.0, "reward": 0.8229166746139527, "reward_std": 0.10837896019220353, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8229166746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.2653172373771667, "sampling/importance_sampling_ratio/max": 1.9938855409622191, "sampling/importance_sampling_ratio/mean": 1.0000988125801087, "sampling/importance_sampling_ratio/min": 0.3662609428167343, "sampling/sampling_logp_difference/max": 1.1286290645599366, "sampling/sampling_logp_difference/mean": 0.013645793497562408, "step": 4100 }, { "epoch": 4.81786133960047, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1599.92, "eval_completions/max_terminated_length": 1599.92, "eval_completions/mean_length": 1176.245625, "eval_completions/mean_terminated_length": 1176.245625, "eval_completions/min_length": 859.8, "eval_completions/min_terminated_length": 859.8, "eval_entropy": 0.2675222271680832, "eval_frac_reward_zero_std": 0.61, "eval_loss": 0.0027719642966985703, "eval_num_tokens": 563335752.0, "eval_reward": 0.7684583449363709, "eval_reward_std": 0.08029463842511177, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7684583449363709, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3004398235678673, "eval_runtime": 443.3249, "eval_samples_per_second": 0.226, "eval_sampling/importance_sampling_ratio/max": 1.9567827081680298, "eval_sampling/importance_sampling_ratio/mean": 1.0000646901130676, "eval_sampling/importance_sampling_ratio/min": 0.33781426847563223, "eval_sampling/sampling_logp_difference/max": 2.017117133140564, "eval_sampling/sampling_logp_difference/mean": 0.01359828345477581, "eval_steps_per_second": 0.005, "step": 4100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1556.4, "completions/max_terminated_length": 1556.4, "completions/mean_length": 1202.3125, "completions/mean_terminated_length": 1202.3125, "completions/min_length": 947.2, "completions/min_terminated_length": 947.2, "entropy": 0.24352549016475677, "epoch": 4.823736780258519, "frac_reward_zero_std": 0.65, "grad_norm": 0.39156123995780945, "learning_rate": 1.8294160407075355e-08, "loss": 0.0027, "num_tokens": 564054156.0, "reward": 0.7713541746139526, "reward_std": 0.06558299511671066, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7713541746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.2723333746194839, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000266551971435, "sampling/importance_sampling_ratio/min": 0.2960414350032806, "sampling/sampling_logp_difference/max": 1.4837098360061645, "sampling/sampling_logp_difference/mean": 0.0127076530829072, "step": 4105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1613.4, "completions/max_terminated_length": 1613.4, "completions/mean_length": 1188.05625, "completions/mean_terminated_length": 1188.05625, "completions/min_length": 928.0, "completions/min_terminated_length": 928.0, "entropy": 0.26019893288612367, "epoch": 4.829612220916569, "frac_reward_zero_std": 0.7, "grad_norm": 0.5982215404510498, "learning_rate": 1.768839350617882e-08, "loss": -0.0017, "num_tokens": 564766574.0, "reward": 0.7906771063804626, "reward_std": 0.06684140712022782, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7906771063804626, "rewards/e2e_recall_precision_mixed_reward/std": 0.3481956601142883, "sampling/importance_sampling_ratio/max": 1.9639420986175538, "sampling/importance_sampling_ratio/mean": 0.9999716877937317, "sampling/importance_sampling_ratio/min": 0.21439356505870819, "sampling/sampling_logp_difference/max": 1.8027127265930176, "sampling/sampling_logp_difference/mean": 0.013319421000778675, "step": 4110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1601.8, "completions/max_terminated_length": 1601.8, "completions/mean_length": 1161.0375, "completions/mean_terminated_length": 1161.0375, "completions/min_length": 875.8, "completions/min_terminated_length": 875.8, "entropy": 0.2612924247980118, "epoch": 4.835487661574618, "frac_reward_zero_std": 0.65, "grad_norm": 0.0, "learning_rate": 1.7082626605282284e-08, "loss": -0.0, "num_tokens": 565466730.0, "reward": 0.86796875, "reward_std": 0.08403022512793541, "rewards/e2e_recall_precision_mixed_reward/mean": 0.86796875, "rewards/e2e_recall_precision_mixed_reward/std": 0.1859685465693474, "sampling/importance_sampling_ratio/max": 1.8856915473937987, "sampling/importance_sampling_ratio/mean": 1.0000352263450623, "sampling/importance_sampling_ratio/min": 0.3413155991952294, "sampling/sampling_logp_difference/max": 3.9962790966033936, "sampling/sampling_logp_difference/mean": 0.013213282637298106, "step": 4115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1830.0, "completions/max_terminated_length": 1756.0, "completions/mean_length": 1275.734375, "completions/mean_terminated_length": 1272.3380126953125, "completions/min_length": 923.4, "completions/min_terminated_length": 923.4, "entropy": 0.2845246493816376, "epoch": 4.841363102232667, "frac_reward_zero_std": 0.65, "grad_norm": 0.4948183298110962, "learning_rate": 1.647685970438575e-08, "loss": -0.0046, "num_tokens": 566217121.0, "reward": 0.8736979246139527, "reward_std": 0.05216329097747803, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8736979246139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.22361454963684083, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000300645828246, "sampling/importance_sampling_ratio/min": 0.3315484285354614, "sampling/sampling_logp_difference/max": 1.25800461769104, "sampling/sampling_logp_difference/mean": 0.014182769320905208, "step": 4120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1612.6, "completions/max_terminated_length": 1612.6, "completions/mean_length": 1163.034375, "completions/mean_terminated_length": 1163.034375, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "entropy": 0.26198195815086367, "epoch": 4.847238542890716, "frac_reward_zero_std": 0.7, "grad_norm": 0.4309649169445038, "learning_rate": 1.5871092803489217e-08, "loss": 0.0025, "num_tokens": 566941612.0, "reward": 0.7827604353427887, "reward_std": 0.05432678535580635, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7827604353427887, "rewards/e2e_recall_precision_mixed_reward/std": 0.24770253524184227, "sampling/importance_sampling_ratio/max": 1.9743736267089844, "sampling/importance_sampling_ratio/mean": 0.9999311327934265, "sampling/importance_sampling_ratio/min": 0.17661737089511007, "sampling/sampling_logp_difference/max": 2.602937865257263, "sampling/sampling_logp_difference/mean": 0.013681701943278312, "step": 4125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1707.6, "completions/max_terminated_length": 1707.6, "completions/mean_length": 1223.121875, "completions/mean_terminated_length": 1223.121875, "completions/min_length": 950.2, "completions/min_terminated_length": 950.2, "entropy": 0.25878497362136843, "epoch": 4.853113983548766, "frac_reward_zero_std": 0.8, "grad_norm": 0.5751305222511292, "learning_rate": 1.5265325902592683e-08, "loss": -0.0006, "num_tokens": 567651027.0, "reward": 0.9020833373069763, "reward_std": 0.031220474932342767, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9020833373069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.17570360600948334, "sampling/importance_sampling_ratio/max": 1.9646418809890747, "sampling/importance_sampling_ratio/mean": 1.000183892250061, "sampling/importance_sampling_ratio/min": 0.319706991314888, "sampling/sampling_logp_difference/max": 1.356735897064209, "sampling/sampling_logp_difference/mean": 0.013199831359088421, "step": 4130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1738.2, "completions/max_terminated_length": 1738.2, "completions/mean_length": 1208.278125, "completions/mean_terminated_length": 1208.278125, "completions/min_length": 829.0, "completions/min_terminated_length": 829.0, "entropy": 0.25107733011245725, "epoch": 4.858989424206816, "frac_reward_zero_std": 0.6, "grad_norm": 0.6848601698875427, "learning_rate": 1.4659559001696147e-08, "loss": -0.0032, "num_tokens": 568384108.0, "reward": 0.8515625119209289, "reward_std": 0.06289163380861282, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8515625119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.270400308072567, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002090692520142, "sampling/importance_sampling_ratio/min": 0.3987681746482849, "sampling/sampling_logp_difference/max": 1.137919044494629, "sampling/sampling_logp_difference/mean": 0.012771274335682392, "step": 4135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1655.8, "completions/max_terminated_length": 1655.8, "completions/mean_length": 1167.203125, "completions/mean_terminated_length": 1167.203125, "completions/min_length": 904.4, "completions/min_terminated_length": 904.4, "entropy": 0.26398766040802, "epoch": 4.864864864864865, "frac_reward_zero_std": 0.55, "grad_norm": 0.0, "learning_rate": 1.4053792100799612e-08, "loss": 0.0105, "num_tokens": 569108429.0, "reward": 0.7697916865348816, "reward_std": 0.0878813236951828, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7697916865348816, "rewards/e2e_recall_precision_mixed_reward/std": 0.3016296923160553, "sampling/importance_sampling_ratio/max": 1.8767553567886353, "sampling/importance_sampling_ratio/mean": 0.9999534130096436, "sampling/importance_sampling_ratio/min": 0.3066461071372032, "sampling/sampling_logp_difference/max": 1.3338868379592896, "sampling/sampling_logp_difference/mean": 0.013632268644869328, "step": 4140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1630.6, "completions/max_terminated_length": 1630.6, "completions/mean_length": 1170.74375, "completions/mean_terminated_length": 1170.74375, "completions/min_length": 885.8, "completions/min_terminated_length": 885.8, "entropy": 0.24844418168067933, "epoch": 4.870740305522914, "frac_reward_zero_std": 0.75, "grad_norm": 0.5103340148925781, "learning_rate": 1.3448025199903076e-08, "loss": 0.0057, "num_tokens": 569776331.0, "reward": 0.9598958373069764, "reward_std": 0.04087250307202339, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9598958373069764, "rewards/e2e_recall_precision_mixed_reward/std": 0.11710455864667893, "sampling/importance_sampling_ratio/max": 1.9135148763656615, "sampling/importance_sampling_ratio/mean": 1.0000514268875123, "sampling/importance_sampling_ratio/min": 0.3595329821109772, "sampling/sampling_logp_difference/max": 1.1677767515182496, "sampling/sampling_logp_difference/mean": 0.012815902382135392, "step": 4145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1628.6, "completions/max_terminated_length": 1628.6, "completions/mean_length": 1177.821875, "completions/mean_terminated_length": 1177.821875, "completions/min_length": 863.4, "completions/min_terminated_length": 863.4, "entropy": 0.2565174579620361, "epoch": 4.876615746180963, "frac_reward_zero_std": 0.65, "grad_norm": 0.612808108329773, "learning_rate": 1.2842258299006541e-08, "loss": -0.0011, "num_tokens": 570475090.0, "reward": 0.8589062571525574, "reward_std": 0.06039836704730987, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8589062571525574, "rewards/e2e_recall_precision_mixed_reward/std": 0.24054351449012756, "sampling/importance_sampling_ratio/max": 1.9100402116775512, "sampling/importance_sampling_ratio/mean": 0.9999492049217225, "sampling/importance_sampling_ratio/min": 0.3987319231033325, "sampling/sampling_logp_difference/max": 1.0621936082839967, "sampling/sampling_logp_difference/mean": 0.013036507740616798, "step": 4150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1603.8, "completions/max_terminated_length": 1603.8, "completions/mean_length": 1136.215625, "completions/mean_terminated_length": 1136.215625, "completions/min_length": 834.0, "completions/min_terminated_length": 834.0, "entropy": 0.2555226027965546, "epoch": 4.882491186839013, "frac_reward_zero_std": 0.7, "grad_norm": 0.4172721207141876, "learning_rate": 1.2236491398110007e-08, "loss": 0.0025, "num_tokens": 571161863.0, "reward": 0.9296875, "reward_std": 0.056744667887687686, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9296875, "rewards/e2e_recall_precision_mixed_reward/std": 0.1651596575975418, "sampling/importance_sampling_ratio/max": 1.9613843202590941, "sampling/importance_sampling_ratio/mean": 1.000058114528656, "sampling/importance_sampling_ratio/min": 0.2753281805664301, "sampling/sampling_logp_difference/max": 1.6068052053451538, "sampling/sampling_logp_difference/mean": 0.013144350983202458, "step": 4155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1938.4, "completions/max_terminated_length": 1938.4, "completions/mean_length": 1309.365625, "completions/mean_terminated_length": 1309.365625, "completions/min_length": 966.0, "completions/min_terminated_length": 966.0, "entropy": 0.2880063831806183, "epoch": 4.888366627497062, "frac_reward_zero_std": 0.6, "grad_norm": 0.5996348857879639, "learning_rate": 1.1630724497213472e-08, "loss": 0.0025, "num_tokens": 571925932.0, "reward": 0.785937511920929, "reward_std": 0.10168065577745437, "rewards/e2e_recall_precision_mixed_reward/mean": 0.785937511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.33307317793369295, "sampling/importance_sampling_ratio/max": 1.990170383453369, "sampling/importance_sampling_ratio/mean": 1.0000520944595337, "sampling/importance_sampling_ratio/min": 0.3627611517906189, "sampling/sampling_logp_difference/max": 1.0843160629272461, "sampling/sampling_logp_difference/mean": 0.014332829415798188, "step": 4160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1756.4, "completions/max_terminated_length": 1756.4, "completions/mean_length": 1272.540625, "completions/mean_terminated_length": 1272.540625, "completions/min_length": 909.8, "completions/min_terminated_length": 909.8, "entropy": 0.28079177141189576, "epoch": 4.894242068155112, "frac_reward_zero_std": 0.7, "grad_norm": 0.0, "learning_rate": 1.1024957596316936e-08, "loss": -0.0013, "num_tokens": 572682153.0, "reward": 0.7119791746139527, "reward_std": 0.06243942677974701, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7119791746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.3010064959526062, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999913239479065, "sampling/importance_sampling_ratio/min": 0.24750588452134253, "sampling/sampling_logp_difference/max": 3.2225181341171263, "sampling/sampling_logp_difference/mean": 0.014154410175979137, "step": 4165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1553.8, "completions/max_terminated_length": 1553.8, "completions/mean_length": 1216.925, "completions/mean_terminated_length": 1216.925, "completions/min_length": 975.8, "completions/min_terminated_length": 975.8, "entropy": 0.2597874790430069, "epoch": 4.900117508813161, "frac_reward_zero_std": 0.6, "grad_norm": 0.6187815070152283, "learning_rate": 1.04191906954204e-08, "loss": -0.0021, "num_tokens": 573397233.0, "reward": 0.9138020992279052, "reward_std": 0.0694403514266014, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9138020992279052, "rewards/e2e_recall_precision_mixed_reward/std": 0.15716297775506974, "sampling/importance_sampling_ratio/max": 1.9616039276123047, "sampling/importance_sampling_ratio/mean": 1.000029969215393, "sampling/importance_sampling_ratio/min": 0.3573975801467896, "sampling/sampling_logp_difference/max": 1.1084071159362794, "sampling/sampling_logp_difference/mean": 0.013234620355069638, "step": 4170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1742.2, "completions/max_terminated_length": 1742.2, "completions/mean_length": 1258.2375, "completions/mean_terminated_length": 1258.2375, "completions/min_length": 925.2, "completions/min_terminated_length": 925.2, "entropy": 0.27962875962257383, "epoch": 4.9059929494712105, "frac_reward_zero_std": 0.65, "grad_norm": 0.5773242115974426, "learning_rate": 9.813423794523867e-09, "loss": -0.0023, "num_tokens": 574103741.0, "reward": 0.9208333492279053, "reward_std": 0.07986356988549233, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9208333492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.1817111313343048, "sampling/importance_sampling_ratio/max": 1.8720341920852661, "sampling/importance_sampling_ratio/mean": 1.0000491261482238, "sampling/importance_sampling_ratio/min": 0.39683855772018434, "sampling/sampling_logp_difference/max": 0.9288500308990478, "sampling/sampling_logp_difference/mean": 0.013723740726709366, "step": 4175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1788.6, "completions/max_terminated_length": 1780.8, "completions/mean_length": 1224.46875, "completions/mean_terminated_length": 1221.0921875, "completions/min_length": 906.4, "completions/min_terminated_length": 906.4, "entropy": 0.27353797256946566, "epoch": 4.91186839012926, "frac_reward_zero_std": 0.7, "grad_norm": 0.5145543813705444, "learning_rate": 9.207656893627331e-09, "loss": -0.007, "num_tokens": 574809695.0, "reward": 0.7510416746139527, "reward_std": 0.06682446748018264, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7510416746139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.3081136792898178, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999987125396729, "sampling/importance_sampling_ratio/min": 0.3361870855093002, "sampling/sampling_logp_difference/max": 1.3072755098342896, "sampling/sampling_logp_difference/mean": 0.013947272859513759, "step": 4180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1716.6, "completions/max_terminated_length": 1716.6, "completions/mean_length": 1253.65, "completions/mean_terminated_length": 1253.65, "completions/min_length": 966.6, "completions/min_terminated_length": 966.6, "entropy": 0.28181648850440977, "epoch": 4.917743830787309, "frac_reward_zero_std": 0.7, "grad_norm": 0.0, "learning_rate": 8.601889992730796e-09, "loss": 0.002, "num_tokens": 575558047.0, "reward": 0.7856770873069763, "reward_std": 0.03310954719781876, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7856770873069763, "rewards/e2e_recall_precision_mixed_reward/std": 0.2692906141281128, "sampling/importance_sampling_ratio/max": 1.9965229749679565, "sampling/importance_sampling_ratio/mean": 0.9999869465827942, "sampling/importance_sampling_ratio/min": 0.26230895724147557, "sampling/sampling_logp_difference/max": 1.7456336498260498, "sampling/sampling_logp_difference/mean": 0.014158726483583451, "step": 4185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1530.2, "completions/max_terminated_length": 1530.2, "completions/mean_length": 1196.39375, "completions/mean_terminated_length": 1196.39375, "completions/min_length": 923.0, "completions/min_terminated_length": 923.0, "entropy": 0.26490248143672945, "epoch": 4.923619271445358, "frac_reward_zero_std": 0.55, "grad_norm": 0.5220557451248169, "learning_rate": 7.99612309183426e-09, "loss": -0.0034, "num_tokens": 576251005.0, "reward": 0.8921875119209289, "reward_std": 0.06537900567054748, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8921875119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.21443742215633393, "sampling/importance_sampling_ratio/max": 1.888991355895996, "sampling/importance_sampling_ratio/mean": 0.999981415271759, "sampling/importance_sampling_ratio/min": 0.31290013790130616, "sampling/sampling_logp_difference/max": 1.2028279781341553, "sampling/sampling_logp_difference/mean": 0.013373297452926636, "step": 4190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1618.0, "completions/max_terminated_length": 1618.0, "completions/mean_length": 1164.04375, "completions/mean_terminated_length": 1164.04375, "completions/min_length": 917.8, "completions/min_terminated_length": 917.8, "entropy": 0.24597953855991364, "epoch": 4.929494712103407, "frac_reward_zero_std": 0.5, "grad_norm": 0.49730363488197327, "learning_rate": 7.3903561909377266e-09, "loss": 0.0001, "num_tokens": 576939563.0, "reward": 0.7997395873069764, "reward_std": 0.1168543741106987, "rewards/e2e_recall_precision_mixed_reward/mean": 0.7997395873069764, "rewards/e2e_recall_precision_mixed_reward/std": 0.27827770859003065, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000683307647704, "sampling/importance_sampling_ratio/min": 0.37277783155441285, "sampling/sampling_logp_difference/max": 1.186050796508789, "sampling/sampling_logp_difference/mean": 0.01289830356836319, "step": 4195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1618.8, "completions/max_terminated_length": 1618.8, "completions/mean_length": 1225.7625, "completions/mean_terminated_length": 1225.7625, "completions/min_length": 839.8, "completions/min_terminated_length": 839.8, "entropy": 0.2759159684181213, "epoch": 4.9353701527614575, "frac_reward_zero_std": 0.7, "grad_norm": 0.5151962041854858, "learning_rate": 6.784589290041192e-09, "loss": -0.0058, "num_tokens": 577667935.0, "reward": 0.940625011920929, "reward_std": 0.06629720851778984, "rewards/e2e_recall_precision_mixed_reward/mean": 0.940625011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.14214378148317336, "sampling/importance_sampling_ratio/max": 1.9627459526062012, "sampling/importance_sampling_ratio/mean": 0.9999911308288574, "sampling/importance_sampling_ratio/min": 0.34458776712417605, "sampling/sampling_logp_difference/max": 1.183539056777954, "sampling/sampling_logp_difference/mean": 0.013748652674257756, "step": 4200 }, { "epoch": 4.9353701527614575, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1622.84, "eval_completions/max_terminated_length": 1622.84, "eval_completions/mean_length": 1187.468125, "eval_completions/mean_terminated_length": 1187.468125, "eval_completions/min_length": 892.2, "eval_completions/min_terminated_length": 892.2, "eval_entropy": 0.27012341380119326, "eval_frac_reward_zero_std": 0.65, "eval_loss": 0.0018613528227433562, "eval_num_tokens": 577667935.0, "eval_reward": 0.7749166774749756, "eval_reward_std": 0.07821166217327118, "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7749166774749756, "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.29328397393226624, "eval_runtime": 453.8836, "eval_samples_per_second": 0.22, "eval_sampling/importance_sampling_ratio/max": 1.9473948335647584, "eval_sampling/importance_sampling_ratio/mean": 0.9999662327766419, "eval_sampling/importance_sampling_ratio/min": 0.3151546062529087, "eval_sampling/sampling_logp_difference/max": 1.3373068857192993, "eval_sampling/sampling_logp_difference/mean": 0.01363888442516327, "eval_steps_per_second": 0.004, "step": 4200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1647.2, "completions/max_terminated_length": 1647.2, "completions/mean_length": 1198.703125, "completions/mean_terminated_length": 1198.703125, "completions/min_length": 915.4, "completions/min_terminated_length": 915.4, "entropy": 0.2633301138877869, "epoch": 4.941245593419507, "frac_reward_zero_std": 0.65, "grad_norm": 0.0, "learning_rate": 6.1788223891446564e-09, "loss": -0.0032, "num_tokens": 578373168.0, "reward": 0.9223958492279053, "reward_std": 0.08152852952480316, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9223958492279053, "rewards/e2e_recall_precision_mixed_reward/std": 0.152499720454216, "sampling/importance_sampling_ratio/max": 1.9259612321853639, "sampling/importance_sampling_ratio/mean": 0.9999021768569947, "sampling/importance_sampling_ratio/min": 0.39434434175491334, "sampling/sampling_logp_difference/max": 1.0410821914672852, "sampling/sampling_logp_difference/mean": 0.013154360838234425, "step": 4205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1669.0, "completions/max_terminated_length": 1669.0, "completions/mean_length": 1275.25, "completions/mean_terminated_length": 1275.25, "completions/min_length": 1009.8, "completions/min_terminated_length": 1009.8, "entropy": 0.2745696842670441, "epoch": 4.947121034077556, "frac_reward_zero_std": 0.55, "grad_norm": 0.5402174592018127, "learning_rate": 5.573055488248122e-09, "loss": 0.0021, "num_tokens": 579086176.0, "reward": 0.9057291746139526, "reward_std": 0.06898038685321808, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9057291746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.16145955622196198, "sampling/importance_sampling_ratio/max": 1.9250396013259887, "sampling/importance_sampling_ratio/mean": 0.9999170303344727, "sampling/importance_sampling_ratio/min": 0.3829080641269684, "sampling/sampling_logp_difference/max": 0.9843248128890991, "sampling/sampling_logp_difference/mean": 0.013959725014865398, "step": 4210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1524.2, "completions/max_terminated_length": 1524.2, "completions/mean_length": 1170.28125, "completions/mean_terminated_length": 1170.28125, "completions/min_length": 893.8, "completions/min_terminated_length": 893.8, "entropy": 0.2561034023761749, "epoch": 4.952996474735605, "frac_reward_zero_std": 0.85, "grad_norm": 0.0, "learning_rate": 4.967288587351586e-09, "loss": -0.0009, "num_tokens": 579767418.0, "reward": 0.9182291746139526, "reward_std": 0.02048187479376793, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9182291746139526, "rewards/e2e_recall_precision_mixed_reward/std": 0.16392614617943763, "sampling/importance_sampling_ratio/max": 1.9762834310531616, "sampling/importance_sampling_ratio/mean": 0.999981677532196, "sampling/importance_sampling_ratio/min": 0.34127419590950014, "sampling/sampling_logp_difference/max": 1.1679274797439576, "sampling/sampling_logp_difference/mean": 0.013132588565349579, "step": 4215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1799.6, "completions/max_terminated_length": 1799.6, "completions/mean_length": 1242.621875, "completions/mean_terminated_length": 1242.621875, "completions/min_length": 910.2, "completions/min_terminated_length": 910.2, "entropy": 0.2759708225727081, "epoch": 4.9588719153936545, "frac_reward_zero_std": 0.5, "grad_norm": 0.43732741475105286, "learning_rate": 4.361521686455052e-09, "loss": 0.0079, "num_tokens": 580478561.0, "reward": 0.8062500119209289, "reward_std": 0.10620709657669067, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8062500119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.26027744710445405, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999176859855652, "sampling/importance_sampling_ratio/min": 0.3623032122850418, "sampling/sampling_logp_difference/max": 1.5115676403045655, "sampling/sampling_logp_difference/mean": 0.013757564499974251, "step": 4220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1617.6, "completions/max_terminated_length": 1617.6, "completions/mean_length": 1131.6875, "completions/mean_terminated_length": 1131.6875, "completions/min_length": 771.8, "completions/min_terminated_length": 771.8, "entropy": 0.2657478004693985, "epoch": 4.964747356051704, "frac_reward_zero_std": 0.6, "grad_norm": 0.5924484133720398, "learning_rate": 3.755754785558517e-09, "loss": -0.0043, "num_tokens": 581201549.0, "reward": 0.8257812619209289, "reward_std": 0.08197875022888183, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8257812619209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.23682957887649536, "sampling/importance_sampling_ratio/max": 1.966193675994873, "sampling/importance_sampling_ratio/mean": 0.9998813509941101, "sampling/importance_sampling_ratio/min": 0.3465057075023651, "sampling/sampling_logp_difference/max": 1.3918760061264037, "sampling/sampling_logp_difference/mean": 0.013801524788141251, "step": 4225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1520.8, "completions/max_terminated_length": 1520.8, "completions/mean_length": 1192.865625, "completions/mean_terminated_length": 1192.865625, "completions/min_length": 935.4, "completions/min_terminated_length": 935.4, "entropy": 0.26043239533901213, "epoch": 4.970622796709753, "frac_reward_zero_std": 0.65, "grad_norm": 0.42304831743240356, "learning_rate": 3.149987884661982e-09, "loss": -0.0002, "num_tokens": 581893074.0, "reward": 0.887500011920929, "reward_std": 0.05844337120652199, "rewards/e2e_recall_precision_mixed_reward/mean": 0.887500011920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.21982333362102507, "sampling/importance_sampling_ratio/max": 1.8982154846191406, "sampling/importance_sampling_ratio/mean": 0.9999570608139038, "sampling/importance_sampling_ratio/min": 0.3804735541343689, "sampling/sampling_logp_difference/max": 1.2776761054992676, "sampling/sampling_logp_difference/mean": 0.013370229117572307, "step": 4230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1726.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 1219.128125, "completions/mean_terminated_length": 1219.128125, "completions/min_length": 872.2, "completions/min_terminated_length": 872.2, "entropy": 0.25902561843395233, "epoch": 4.976498237367803, "frac_reward_zero_std": 0.65, "grad_norm": 0.0, "learning_rate": 2.544220983765447e-09, "loss": -0.002, "num_tokens": 582591003.0, "reward": 0.8218750119209289, "reward_std": 0.07310913950204849, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8218750119209289, "rewards/e2e_recall_precision_mixed_reward/std": 0.28861500322818756, "sampling/importance_sampling_ratio/max": 1.9743704080581665, "sampling/importance_sampling_ratio/mean": 0.9999637246131897, "sampling/importance_sampling_ratio/min": 0.4172662615776062, "sampling/sampling_logp_difference/max": 0.8795746564865112, "sampling/sampling_logp_difference/mean": 0.013227641209959983, "step": 4235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1684.2, "completions/max_terminated_length": 1684.2, "completions/mean_length": 1250.65, "completions/mean_terminated_length": 1250.65, "completions/min_length": 897.6, "completions/min_terminated_length": 897.6, "entropy": 0.27604796886444094, "epoch": 4.982373678025852, "frac_reward_zero_std": 0.7, "grad_norm": 0.0, "learning_rate": 1.938454082868912e-09, "loss": 0.0021, "num_tokens": 583305659.0, "reward": 0.9108333349227905, "reward_std": 0.06361775994300842, "rewards/e2e_recall_precision_mixed_reward/mean": 0.9108333349227905, "rewards/e2e_recall_precision_mixed_reward/std": 0.14266837537288665, "sampling/importance_sampling_ratio/max": 1.9794315576553345, "sampling/importance_sampling_ratio/mean": 0.9999402523040771, "sampling/importance_sampling_ratio/min": 0.3792805254459381, "sampling/sampling_logp_difference/max": 1.2729737639427186, "sampling/sampling_logp_difference/mean": 0.013952986150979996, "step": 4240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1632.8, "completions/max_terminated_length": 1632.8, "completions/mean_length": 1154.265625, "completions/mean_terminated_length": 1154.265625, "completions/min_length": 861.8, "completions/min_terminated_length": 861.8, "entropy": 0.2639856070280075, "epoch": 4.9882491186839015, "frac_reward_zero_std": 0.7, "grad_norm": 0.4559522569179535, "learning_rate": 1.332687181972377e-09, "loss": 0.0018, "num_tokens": 583989680.0, "reward": 0.879687511920929, "reward_std": 0.04789134860038757, "rewards/e2e_recall_precision_mixed_reward/mean": 0.879687511920929, "rewards/e2e_recall_precision_mixed_reward/std": 0.17645513415336608, "sampling/importance_sampling_ratio/max": 1.9888315200805664, "sampling/importance_sampling_ratio/mean": 0.9999840378761291, "sampling/importance_sampling_ratio/min": 0.39148822128772737, "sampling/sampling_logp_difference/max": 1.0263358354568481, "sampling/sampling_logp_difference/mean": 0.01353690456598997, "step": 4245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1750.6, "completions/max_terminated_length": 1740.6, "completions/mean_length": 1204.640625, "completions/mean_terminated_length": 1201.0263427734376, "completions/min_length": 954.0, "completions/min_terminated_length": 954.0, "entropy": 0.2623937726020813, "epoch": 4.994124559341951, "frac_reward_zero_std": 0.55, "grad_norm": 0.6408044099807739, "learning_rate": 7.26920281075842e-10, "loss": -0.0026, "num_tokens": 584716969.0, "reward": 0.8661458373069764, "reward_std": 0.09312712252140046, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8661458373069764, "rewards/e2e_recall_precision_mixed_reward/std": 0.25531432032585144, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999866724014282, "sampling/importance_sampling_ratio/min": 0.3677119523286819, "sampling/sampling_logp_difference/max": 1.0782905578613282, "sampling/sampling_logp_difference/mean": 0.013216838613152504, "step": 4250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 1752.6, "completions/max_terminated_length": 1748.2, "completions/mean_length": 1165.646875, "completions/mean_terminated_length": 1150.09833984375, "completions/min_length": 852.2, "completions/min_terminated_length": 852.2, "entropy": 0.2466509908437729, "epoch": 5.0, "frac_reward_zero_std": 0.5, "grad_norm": 0.45128658413887024, "learning_rate": 1.21153380179307e-10, "loss": -0.0331, "num_tokens": 585407944.0, "reward": 0.8033854246139527, "reward_std": 0.09269858747720719, "rewards/e2e_recall_precision_mixed_reward/mean": 0.8033854246139527, "rewards/e2e_recall_precision_mixed_reward/std": 0.2835982650518417, "sampling/importance_sampling_ratio/max": 1.890273141860962, "sampling/importance_sampling_ratio/mean": 1.000039005279541, "sampling/importance_sampling_ratio/min": 0.3465990424156189, "sampling/sampling_logp_difference/max": 1.0836182117462159, "sampling/sampling_logp_difference/mean": 0.01285779345780611, "step": 4255 }, { "epoch": 5.0, "step": 4255, "total_flos": 0.0, "train_loss": 0.0002628941724753198, "train_runtime": 110827.1867, "train_samples_per_second": 0.154, "train_steps_per_second": 0.038 } ], "logging_steps": 5, "max_steps": 4255, "num_input_tokens_seen": 585407944, "num_train_epochs": 5, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }